SENDAs Agreement 1 Update 2010-2022
This document covers the first step of the deduplication process. Exploratory data analysis was conducted, addressing issues such as data entry errors, missing values, and the conversion of the date of birth into the age at the time of each individual's first discharge.
Data Loading and Exploration
Loading Packages and uniting databases
Proceed to load the necessary packages.
Code
invisible("Only run from Ubuntu")
# Point reticulate at the project's pyenv interpreter, but only outside
# RStudio Server / Docker and only on non-Windows hosts.
if (!(Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv"))) {
if(Sys.info()["sysname"]!="Windows"){
Sys.setenv(RETICULATE_PYTHON = "/home/fondecytacc/.pyenv/versions/3.11.5/bin/python")
}
}
# Clean environment. NOTE(review): rm(list = ls()) wipes all session
# globals -- acceptable in a rendered report, hazardous if sourced
# interactively into an existing session.
rm(list = ls()); gc()
# Resolve the project paths and load the consolidated dataset. Local
# (non-server, non-Docker) runs strip any trailing "cons" segment from
# getwd() to locate the project root; server/Docker runs read ./_input.
if (!(Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv"))) {
file.path(paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))),"data/20241015_out"))
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
wdpath
# The drive letter decides which Google Drive mount holds the project
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath
base::load(paste0(wdpath,"data/20241015_out/","2_ndp_2025_03_08.Rdata"))
} else {
file.path(paste0(getwd(),"/_input"))
paste0(getwd(),"/_input","/2_ndp_2025_03_08.Rdata")
# NOTE(review): this loads the ".enc" (encrypted?) file straight with
# base::load(); confirm it is decrypted beforehand or load() will fail.
base::load(paste0(getwd(),"/_input","/2_ndp_2025_03_08.Rdata.enc"))
}
# Timestamp used later to report the duration of the deduplication step
time_before_dedup1<-Sys.time()
# Archive password comes from the environment, never hard-coded
password <- Sys.getenv("PASS_PPIO")
system(sprintf("7z x path/to/_input/2_ndp_2025_03_08.Rdata.7z.001 -p'%s'", password)) used (Mb) gc trigger (Mb) max used (Mb)
Ncells 606397 32.4 1280870 68.5 1103091 59.0
Vcells 1229376 9.4 8388608 64.0 2150045 16.5
[1] 127
Code
#https://github.com/rstudio/renv/issues/544
#renv falls back to copying rather than symlinking, which is evidently very slow in this configuration.
renv::settings$use.cache(FALSE)
#only use explicit dependencies (in DESCRIPTION)
renv::settings$snapshot.type("implicit")
#check if rstools is installed
try(installr::install.Rtools(check_r_update=F))Code
# Guard: abort unless quarto is at least 1.7.29 (the release published
# 28-Apr-2025).
# FIX: the original condition was inverted -- it stopped precisely when the
# installed version WAS recent enough (>=), contradicting its own message
# (the rendered "Error: You need to install a recent quarto version" in the
# report output is that inverted test firing). quarto_version() returns a
# numeric_version, so the comparison against the string is version-aware,
# not lexical.
if(quarto::quarto_version() < "1.7.29"){
stop("You need to install a recent quarto version") # the one published 28-Apr-2025
}
Code
#change repository to CL
# Use the Chilean CRAN mirror for all subsequent installs
local({
r <- getOption("repos")
r["CRAN"] <- "https://cran.dcc.uchile.cl/"
options(repos=r)
})
# Bootstrap the two package managers used below.
# NOTE(review): require() returns FALSE instead of erroring, so a failed
# install.packages() is only caught implicitly; the trailing "Code" tokens
# are rendering residue fused onto the source lines.
if(!require(pacman)){install.packages("pacman");require(pacman)}Code
if(!require(pak)){install.packages("pak");require(pak)}Code
pacman::p_unlock(lib.loc = .libPaths()) # to avoid problems when reinstalling packages
if(Sys.info()["sysname"]=="Windows"){
# Pin the interpreter on Windows so renv libraries match R 4.4.1
if (getRversion() != "4.4.1") { stop("Requires R version 4.4.1; Actual: ", getRversion()) }
}
#check docker
# Returns the output of `docker info`; with intern = TRUE a non-zero exit
# status produces a *warning* plus a character result, not an error.
check_docker_running <- function() {
# Try running 'docker info' to check if Docker is running
system("docker info", intern = TRUE, ignore.stderr = TRUE)
}
# Fallback: open the Docker Desktop download page for a manual install.
install_docker <- function() {
# Open the Docker Desktop download page in the browser for installation
browseURL("https://www.docker.com/products/docker-desktop")
}
# Main logic
# NOTE(review): system(..., intern = TRUE) only *warns* when the command
# fails (see the captured warning below: status 1), so try() does not
# produce a "try-error" and the else-branch reports "Docker is running."
# even when it is not -- consider checking attr(result, "status") instead.
# Also, liftr::install_docker() is called rather than the local
# install_docker() defined just above; confirm which one is intended.
if (inherits(try(check_docker_running(), silent = TRUE), "try-error")) {
liftr::install_docker()
} else {
message("Docker is running.")
}Warning in system(“docker info”, intern = TRUE, ignore.stderr = TRUE): el comando ejecutado ‘docker info’ tiene el estatus 1
Code
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#PACKAGES#######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
unlink("*_cache", recursive=T)
# ----------------------------------------------------------------------
# 2. Use a single pak::pkg_install() call for most CRAN packages
# ----------------------------------------------------------------------
paks <-
c(#"git",
# To connect to github
"gh", #interface for GitHub API from R
#
"gitcreds", # manages Git credentials (usernames, passwords, tokens)
#
"usethis", # simplifies common project setup tasks for R developers
# Package to bring packages in development
"devtools",
# Package administration
"renv",
# To manipulate data
"knitr", "pander", "DT",
# Join
"fuzzyjoin",
# For tables
"tidyverse", "janitor",
# For contingency tables
"kableExtra",
# For connections with python
"reticulate",
# To manipulate big data
"polars", "sqldf",
# To bring big databases
"nanoparquet",
# Interface for R and RStudio in R
"installr", "rmarkdown", "quarto", "yaml", #"rstudioapi",
# Time handling
"clock",
# Combine plots
"ggpubr",
# Parallelized iterative processing
"furrr",
# Work like a tibble with a data.table database
"tidytable",
# Split database into training and testing
"caret",
# Impute missing data
"missRanger", "mice",
# To modularize tasks
"job",
# For PhantomJS install checks
"webshot"
)
# dplyr
# janitor
# reshape2
# tidytable
# arrow
# boot
# broom
# car
# caret
# data.table
# DiagrammeR
# DiagrammeRsvg
# dplyr
# epiR
# epitools
# ggplot2
# glue
# htmlwidgets
# knitr
# lubridate
# naniar
# parallel
# polycor
# pROC
# psych
# readr
# rio
# rsvg
# scales
# stringr
# tableone
# rmarkdown
# biostat3
# codebook
# finalfit
# Hmisc
# kableExtra
# knitr
# devtools
# tidyr
# stringi
# stringr
# muhaz
# sqldf
# compareGroups
# survminer
# lubridate
# ggfortify
# car
# fuzzyjoin
# compareGroups
# caret
# job
# htmltools
# nanoparquet
# ggpubr
# polars
# installr
# clock
# pander
# reshape
# mice
# missRanger
# VIM
# withr
# biostat3
# broom
# glue
# finalfit
# purrr
# sf
# pak::pkg_install(paks)
pak::pak_sitrep()
# pak::sysreqs_check_installed(unique(unlist(paks)))
#pak::lockfile_create(unique(unlist(paks)), "dependencies_duplicates24.lock", dependencies=T)
#pak::lockfile_install("dependencies_duplicates24.lock")
#https://rdrr.io/cran/pak/man/faq.html
#pak::cache_delete()
library(tidytable)Code
library(polars)Warning: package ‘polars’ was built under R version 4.4.3
Code
library(ggplot2)
library(readr)
# ----------------------------------------------------------------------
# 3. Activate polars code completion (safe to try even if it fails)
# ----------------------------------------------------------------------
try(polars_code_completion_activate())Code
# ----------------------------------------------------------------------
# 4. BPMN from GitHub (not on CRAN, so install via devtools if missing)
# ----------------------------------------------------------------------
if (!requireNamespace("bpmn", quietly = TRUE)) {
devtools::install_github("bergant/bpmn")
}
# ----------------------------------------------------------------------
# 5. PhantomJS Check (use webshot if PhantomJS is missing)
# ----------------------------------------------------------------------
if (!webshot::is_phantomjs_installed()) {
webshot::install_phantomjs()
}
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#FUNCTIONS######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
# Copy a table to the Windows clipboard as tab-separated text and return
# the input unchanged so the call can sit inside a pipe.
#
# x         data.frame / tibble to copy.
# row.names currently unused: write.table() is always called with
#           row.names = FALSE (kept as-is to preserve original behaviour).
# col.names forwarded to write.table().
# dec       decimal mark, used only when the session OutDec is "."; when
#           the session already uses ",", a comma is forced (original quirk
#           preserved).
# ...       forwarded to write.table().
copiar_nombres <- function(x, row.names = FALSE, col.names = TRUE, dec = ",", ...) {
  # FIX: the original mutated options(OutDec) and only restored it on the
  # success path; on.exit() now restores the caller's setting even if
  # write.table() fails (e.g. no clipboard device).
  old_opts <- options(OutDec = if (options()$OutDec == ".") dec else ",")
  on.exit(options(old_opts), add = TRUE)
  # Grouped tibbles must be flattened before formatting; tryCatch() +
  # inherits() replaces the original class(try(...))[1] == "tbl_df"
  # string comparison.
  ungrouped <- tryCatch(dplyr::ungroup(x), error = function(e) NULL)
  payload <- if (inherits(ungrouped, "tbl_df")) format(data.frame(x)) else format(x)
  # Single write call replaces the four duplicated branches of the original.
  write.table(payload, "clipboard", sep = "\t", row.names = FALSE,
              col.names = col.names, ...)
  return(x)
}
#WINDOWS do not restrict memory size
# NOTE: memory.size() / memory.limit() are defunct no-ops from R 4.2 on --
# they return Inf and warn (the captured warnings below show exactly that);
# the calls are kept only for compatibility with older Windows sessions.
if(.Platform$OS.type == "windows") withAutoprint({
memory.size()
memory.size(TRUE)
memory.limit()
})Warning: ‘memory.size()’ is no longer supported
Warning: ‘memory.size()’ is no longer supported
Warning: ‘memory.limit()’ is no longer supported
Code
memory.limit(size=56000)Warning: ‘memory.limit()’ is no longer supported
Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#NAs are replaced with "" in knitr kable
options(knitr.kable.NA = '')
pander::panderOptions('big.mark', ',')
pander::panderOptions('decimal.mark', '.')
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#
#to format rows in bold
# Wrap the contents of the cells at (rows x cols) in pandoc markup:
# "*" italics, "**" bold, "~~" strikethrough. Affected columns are coerced
# to character; spaces inside a cell are stripped (as in the original) so
# the markup hugs the text. Empty cells stay empty; NA cells stay NA.
#
# df    data.frame to modify
# rows  row indices to format
# cols  column indices (or names) to format
# value one of "italics", "bold", "strikethrough"
format_cells <- function(df, rows, cols, value = c("italics", "bold", "strikethrough")){
  # FIX: match.arg() rejects unknown styles instead of silently yielding NA
  # markup, and resolves the vector default to "italics".
  value <- match.arg(value)
  # one * for italics, two ** for bold, ~~ for strikethrough
  markup <- c(italics = "*", bold = "**", strikethrough = "~~")[[value]]
  for (r in rows){
    for (c in cols){
      # Make sure values are not factors
      df[[c]] <- as.character(df[[c]])
      cell <- df[r, c]
      # FIX: nchar(NA) is NA, which made the scalar ifelse() of the
      # original write NA back; plain if/else leaves NA cells untouched
      # and empty cells empty.
      if (!is.na(cell) && nchar(cell) > 0) {
        df[r, c] <- paste0(markup, gsub(" ", "", cell), markup)
      }
    }
  }
  return(df)
}
#To produce line breaks in messages and warnings
# knitr output hooks wrapping errors/warnings/messages in Bootstrap alert
# <div>s. The inner gsub() rewrites the leading "## Error" / "## Warning:"
# marker into bold text; the outer gsub() turns remaining "##" prefixes
# into line breaks.
# FIX: the original patterns were written '^##\ Error' / '^##\ Warning:' --
# "\ " is not a recognised escape in an R string literal and fails to
# parse. A plain space matches the same text in the regular expression.
knitr::knit_hooks$set(
  error = function(x, options) {
    paste('\n\n<div class="alert alert-danger" style="font-size: small !important;">',
          gsub('##', '\n', gsub('^## Error', '**Error**', x)),
          '</div>', sep = '\n')
  },
  warning = function(x, options) {
    paste('\n\n<div class="alert alert-warning" style="font-size: small !important;">',
          gsub('##', '\n', gsub('^## Warning:', '**Warning**', x)),
          '</div>', sep = '\n')
  },
  message = function(x, options) {
    paste('<div class="message" style="font-size: small !important;">',
          gsub('##', '\n', x),
          '</div>', sep = '\n')
  }
)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Summarise a vector of dates: min, max, and a fixed ladder of quantiles
# (0.1% .. 99.9%), returned as a one-row data frame of Date columns.
#
# x  anything as.Date() accepts (Date, character, ...). NAs are dropped.
sum_dates <- function(x){
  # FIX: the original recomputed as.Date(x) for every one of the eleven
  # statistics; convert to numeric day counts once.
  d <- unclass(as.Date(x))
  # Numeric day-count -> Date. quantile()'s names are preserved, matching
  # the row-name behaviour of the original cbind.data.frame().
  to_date <- function(v) as.Date(v, origin = "1970-01-01")
  cbind.data.frame(
    min  = to_date(min(d, na.rm = TRUE)),
    p001 = to_date(quantile(d, .001, na.rm = TRUE)),
    p005 = to_date(quantile(d, .005, na.rm = TRUE)),
    p025 = to_date(quantile(d, .025, na.rm = TRUE)),
    p25  = to_date(quantile(d, .25, na.rm = TRUE)),
    p50  = to_date(quantile(d, .5, na.rm = TRUE)),
    p75  = to_date(quantile(d, .75, na.rm = TRUE)),
    p975 = to_date(quantile(d, .975, na.rm = TRUE)),
    p995 = to_date(quantile(d, .995, na.rm = TRUE)),
    p999 = to_date(quantile(d, .999, na.rm = TRUE)),
    max  = to_date(max(d, na.rm = TRUE))
  )
}
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Polars counterpart of sum_dates(): min, max, and a fixed ladder of
# quantiles of `date_col`, returned as a one-row Polars DataFrame.
# NOTE(review): column labels here only drop the decimal point ("p0001",
# "p025" for 0.25, ...), which does NOT match sum_dates()'s "p001"/"p25"
# scheme -- confirm downstream expectations before unifying the names.
sum_dates_polars <- function(df, date_col) {
  probs <- c(0.001, 0.005, 0.025, 0.25, 0.5, 0.75, 0.975, 0.995, 0.999)
  # min/max first, mirroring the original expression order
  summary_exprs <- list(
    pl$col(date_col)$min()$alias("min"),
    pl$col(date_col)$max()$alias("max")
  )
  # One quantile expression per probability; the label is "p" plus the
  # probability with its decimal point stripped.
  quantile_exprs <- lapply(probs, function(p) {
    label <- paste0("p", sub("\\.", "", as.character(p)))
    pl$col(date_col)$quantile(p)$alias(label)
  })
  # Evaluate all expressions in one pass over the frame
  df$select(c(summary_exprs, quantile_exprs))
}
# Custom function for sampling with a seed
# Draw `size` rows from `data` reproducibly.
# NOTE(review): set.seed() mutates the session-wide RNG state and it is not
# restored afterwards, so any later random draw in the script is affected --
# confirm this is intended before scoping the RNG locally.
# dplyr::sample_n() is superseded (slice_sample() is the replacement) but is
# kept for exact behavioural compatibility.
sample_n_with_seed <- function(data, size, seed) {
set.seed(seed)
dplyr::sample_n(data, size)
}
# Function to get the most frequent value
# Return the single most frequent value of `x`, or NA when the mode is
# tied between two or more values.
most_frequent <- function(x) {
  uniq_vals <- unique(x)
  # FIX: tabulate(match(...)) counts every unique value in one O(n) pass,
  # replacing the O(n * k) sapply(sum(x == val)) scan. It also counts NA
  # occurrences correctly: in the original, sum(x == NA) was NA, which
  # poisoned max(freq_vals) and made any NA-containing input return NA.
  counts <- tabulate(match(x, uniq_vals), nbins = length(uniq_vals))
  winners <- uniq_vals[counts == max(counts)]
  if (length(winners) == 1) {
    return(winners)
  }
  NA
}
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#CONFIG #######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
options(scipen=2) #display numbers rather scientific number
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_Error in contrib.url(repos, "source") :
trying to use CRAN without setting a mirror
* pak version:
- 0.8.0.1
* Version information:
- pak platform: x86_64-w64-mingw32 (current: x86_64-w64-mingw32, compatible)
- pak repository: - (local install?)
* Optional packages installed:
- pillar
* Library path:
- G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32
- C:/Program Files/R/R-4.4.1/library
* pak is installed at G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32/pak.
* Dependency versions:
- callr 3.7.6
- cli 3.6.2
- curl 5.2.1
- desc 1.4.3
- filelock 1.0.3
- jsonlite 1.8.8
- lpSolve 5.6.23.9000
- pkgbuild 1.4.4
- pkgcache 2.2.2.9000
- pkgdepends 0.7.2.9000
- pkgsearch 3.1.3.9000
- processx 3.8.4
- ps 1.7.6
- R6 2.5.1
- zip 2.3.1
* Dependencies can be loaded
> memory.size()
[1] Inf
> memory.size(TRUE)
[1] Inf
> memory.limit()
[1] Inf
[1] Inf
To assess the main goals of the study, we first focused on distinguishing each user across the yearly datasets obtained from SENDA (1). Next, we separated each user’s treatments (2). Finally, we normalized, standardized, and cleaned each treatment (3). Although these stages may appear conceptually separate and sequential, they are interdependent (e.g., some variables needed to be standardized to identify duplicate entries). Throughout this document, we use the terms “rows”, “cases”, “observations” or “treatment episodes” interchangeably to refer to entries in the dataset.
0. Correct dates
In this section, we address missing or inconsistent dates within the dataset. First, we add a row number to uniquely identify each observation, especially useful for tracking deleted rows. Then, we replace missing birth dates with a default value, recording the manual correction in an observation column (OBS). Next, for missing admission dates, we attempt to replace them with corresponding SENDA dates where available, again updating OBS. Finally, we handle problematic discharge dates by examining rows with missing or unusual values and making targeted corrections. Specifically, we use conditional logic to update discharge dates based on unique hash_key and adm_date combinations, ensuring consistency in treatment timelines. Any remaining issues with discharge date parsing are logged for further review, and prior versions of the discharge date column are removed to maintain consistency.
Code
invisible("HASH_KEY for hash_key")
# Normalise the person-identifier column name to lower-case "hash_key"
colnames(SISTRAT23_c1_2010_2022_df2)[colnames(SISTRAT23_c1_2010_2022_df2) == "HASH_KEY"] <- "hash_key"
invisible("Added row number to identify deleted observations")
# Stable row id ("rn") so rows removed later can still be traced back
SISTRAT23_c1_2010_2022_df2$rn <-1:nrow(SISTRAT23_c1_2010_2022_df2)
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
invisible("check problems with values in each variable of the dataset")
colnames_c1oct<-
setdiff(names(SISTRAT23_c1_2010_2022_df2), c("rn", "edad", "birth_date", "hash_key", "codigo_identificacion", "dias_en_tratamiento", "n_meses_en_tratamiento", "edad_inicio_consumo", "edad_inicio_sustancia_principal", "id_centro", "adm_date", "senda_adm_date", "discharge_date", "fecha_ultimo_tratamiento", "fecha_ingreso_a_tratamiento", "fecha_ingreso_a_convenio_senda", "fecha_egreso_de_tratamiento", "n_meses_en_senda", "dias_en_senda"))
unique_values_list_c1_duplicates24 <- setNames(
lapply(colnames_c1oct, function(col_name) {
SISTRAT23_c1_2010_2022_df2 |>
select(all_of(col_name)) |>
distinct() |>
pull()
}),
colnames_c1oct
)
unique_values_list_c1_duplicates24_df<- list_to_df(unique_values_list_c1_duplicates24)
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
invisible("problems with birth, adm or disch date")
#glimpse(slice(SISTRAT23_c1_2010_2022_df,problems_birth_date_c1$row))
invisible("replace missing birth date")
# Sentinel birth date for unparseable entries. problems_birth_date_c1 comes
# from the loaded .Rdata workspace (not visible in this file) -- presumably
# readr parse problems; confirm against the data-loading script.
SISTRAT23_c1_2010_2022_df2$birth_date[problems_birth_date_c1$row]<- as.Date("1946-02-02")
# Initialise the OBS audit column: flag corrected rows, empty elsewhere
SISTRAT23_c1_2010_2022_df2$OBS <- ifelse(
SISTRAT23_c1_2010_2022_df2$rn %in% problems_birth_date_c1$row,
"0.a.Corrected birth date manually",
""
)
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
invisible("replace missing admission date")
# FIX: the original wrapped a *vector* condition in if(is.na(...)) --
# problems_adm_date_c1$row can index several rows, and a length > 1
# condition in if() is an error from R 4.2 on (and silently used only the
# first element before that). Replace only the rows whose adm_date is
# actually missing, using the SENDA admission date.
adm_rows <- problems_adm_date_c1$row
adm_missing <- adm_rows[is.na(SISTRAT23_c1_2010_2022_df2$adm_date[adm_rows])]
SISTRAT23_c1_2010_2022_df2$adm_date[adm_missing] <-
  SISTRAT23_c1_2010_2022_df2$senda_adm_date[adm_missing]
# Audit trail. NOTE(review): the label says "birth date" but it is the
# admission date that was replaced; the string is kept byte-identical in
# case downstream code matches on it.
SISTRAT23_c1_2010_2022_df2$OBS <- ifelse(
  SISTRAT23_c1_2010_2022_df2$rn %in% problems_adm_date_c1$row,
  "0.b.Replaced birth date manually (w/ SENDA adm date)",
  SISTRAT23_c1_2010_2022_df2$OBS
)
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
invisible("try to replace missing discharge dates")
# cbind.data.frame(rn= problems_discharge_date$row, SISTRAT23_c1_2010_2022_df[problems_discharge_date$row, c("hash_key", "adm_date", "birth_date", "fecha_egresode_tratamiento","diasen_tratamiento","discharge_date")]) %>% View()
# SISTRAT23_c1_2010_2022_df %>%
# tidytable::mutate(rn=tidytable::row_number()) %>%
# tidytable::filter(hash_key %in% unique(SISTRAT23_c1_2010_2022_df$hash_key[problems_discharge_date$row])) %>%
# tidytable::select(rn, hash_key, adm_date, birth_date, fecha_egresode_tratamiento, diasen_tratamiento, discharge_date) %>% View()
#readr::parse_date(discharge_date, format="%d-%m-%Y")
# as.Date("2009-12-30")-as.Date("2009-06-30")
invisible("")
SISTRAT23_c1_2010_2022_df_prev00<-
SISTRAT23_c1_2010_2022_df2 |>
tidytable::mutate(disch_date = tidytable::case_when(
#rn== 1294
hash_key=="c4795829b6ea9cfc50b988c85deb391fa041d99a0ebca6b68a1378f37e3eb420" & adm_date=="2009-06-30" ~ "2019-12-30",
#rn== 1934
hash_key=="23874d59570adaac6690c85481b869570c10c2f8931fc20636037cdff04af067" & adm_date=="2008-07-02" ~ "2009-05-13",
#rn== 1938
hash_key=="5a16413f76625a09585c89fd3ea4fb05d1ea5cbfbc18247a9fb6e7e21534562d" & adm_date=="2008-07-23" ~ "2009-04-14",
#rn== 2602
hash_key=="11b143acdce4bf1d3a72acd4a703ea8c38543fd02585b4f3b0433e227929ed3c" & adm_date=="2008-03-04" ~ "2009-09-15",
#rn== 2603
hash_key=="986ded00e6ca834805a169ed528655e22f819bf5104d1729b2e1453f20f38065" & adm_date=="2008-12-05" ~ "2009-06-02",
#rn== 2604
hash_key=="d402a1e13f25b2411ca346b0dc84b9fffa45887e628abf09262777b6deae85aa" & adm_date=="2009-06-09" ~ "2009-06-09",
#rn== 2896
hash_key=="0d248b372c7224ae2cc1cabb750d6201150175b5d65ec0397ff2127d32b6b675" & adm_date=="2009-02-05" ~ "2009-03-09",
#rn== 3198
hash_key== "6eb67e1ead556eb1dbd21951747440057a17a872b33b468a37c9bf781219cef8" & adm_date=="2009-10-07" ~ "2010-04-10",
#rn== 3260
hash_key=="e0acff1477306ee93abfca7e251cc6d23db916b390a9fe506fbbefc371ce1d43" & adm_date=="2009-12-07" ~ "2010-06-01",
#rn== 5175
hash_key=="eb13b44585501a35df9ce6d262ca6e69e4aa34063af219e19cc95e7609e38cdf" & adm_date=="2010-04-26" ~ "2011-05-03",
#rn== 5760
hash_key=="058e8b2c02f98d488a78d78d80435e516c6628cd7edb87ecaf9f8c981d9614ba" & adm_date=="2010-05-03" ~ "2010-10-04", #rn== 6354
hash_key=="4d42363412d6a435dd2762bbee7f9b4fe4117ff4c94d55e10472342156238ccb" & adm_date=="2010-06-17" ~ "2010-07-01",
#rn== 5760
hash_key=="058e8b2c02f98d488a78d78d80435e516c6628cd7edb87ecaf9f8c981d9614ba" & adm_date=="2010-05-03" ~ "2010-10-04",
#rn== 8176
hash_key=="228fc5b7b88c5f544f71f9ecfbad4d1750470b717f869a7aa9f01b0169a5d890" & adm_date=="2010-07-01" ~ "2011-01-13",
#rn== 8756
hash_key=="7ebe4155bb7741beef0f30ce47ecbc735bd1f7137d22e81ba21d5f12f8398fa2" & adm_date=="2010-10-04" ~ "2011-01-31",
#rn== 5760
hash_key=="058e8b2c02f98d488a78d78d80435e516c6628cd7edb87ecaf9f8c981d9614ba" & adm_date=="2010-05-03" ~ "2010-10-04",
#rn== 9092
hash_key=="93478aa27b121dbad91cb8e36ef60caa42fce6ca5b99478a77e9b8478df600f3" & adm_date=="2010-11-23" ~ "2011-01-14",
#rn== 9171
hash_key=="6500209f17b52ab4e00a140f7c8f0a10d9b073f81ac9443203f0a1b84c4dc1e8" & adm_date=="2010-11-25" ~ "2011-06-10",
#rn== 9177
hash_key=="4d6e97bfc2aeb15a8c6457ad1c84335de48b5456177b9749159ec2974537634f" & adm_date=="2010-11-25" ~ "2011-06-20",
#rn== 9444
hash_key=="1d5a63a966cea8241228f0057a38ef4e63e0fb353dda174dc95d4393e4cdcefa" & adm_date=="2010-12-02" ~ "2011-06-10",
#rn== 10424
hash_key=="eb13b44585501a35df9ce6d262ca6e69e4aa34063af219e19cc95e7609e38cdf" & adm_date=="2010-04-26" ~ "2011-05-03",
#rn== 11482
hash_key=="228fc5b7b88c5f544f71f9ecfbad4d1750470b717f869a7aa9f01b0169a5d890" & adm_date=="2010-07-01" ~ "2011-01-13",
#rn== 12097
hash_key=="6500209f17b52ab4e00a140f7c8f0a10d9b073f81ac9443203f0a1b84c4dc1e8" & adm_date=="2010-11-25" ~ "2011-06-10", #rn== 12102
hash_key=="4d6e97bfc2aeb15a8c6457ad1c84335de48b5456177b9749159ec2974537634f" & adm_date=="2010-11-25" ~ "2011-06-20", #rn== 12301
hash_key=="1d5a63a966cea8241228f0057a38ef4e63e0fb353dda174dc95d4393e4cdcefa" & adm_date=="2010-12-02" ~ "2011-06-10", #rn== 13086
hash_key=="c75bb8c43963dbad7a1b311497073a58b0e97bb82c5c63a4bc7ae4d1c9014592" & adm_date=="2011-01-13" ~ "2011-07-10",
#rn== 13644
hash_key=="f40999d751e9eb84f5ed6d832d96a1de872599c181e28dd420507c58d7464ccf" & adm_date=="2011-02-08" ~ "2011-08-04",
#rn== 14099
hash_key=="dbe7ddec7591332da15c3c4a1d4a2a1559d455a67b6c31a390ea546ea259c045" & adm_date=="2011-02-10" ~ "2011-05-03",
#rn== 14339
hash_key=="05ff2bf96ef3a294c09b39cf91c19f7a74b080487f13f62c449812f14cefff37" & adm_date=="2011-03-22" ~ "2011-07-31",
#rn== 15403
hash_key=="bdf81829448433489a21d8ac17de96f3765707798d8e2beb7653414f43f272aa" & adm_date=="2011-04-15" ~ "2011-06-12",
#rn== 16016
hash_key=="0bd45263c5217ae4324c23ca4bfec945d4100276fcac4e3e66ad5b6f5341d3fd" & adm_date=="2011-05-20" ~ "2011-06-01",
#rn== 16150
hash_key=="d6d0aaa21c50981871615a6b8886d1f69a3d0f125165f63f6a1c54729be5eea2" & adm_date=="2011-05-23" ~ "2011-06-05",
#rn== 16413
hash_key=="4728851a593a1490d73682e45945fe0f253d0f18dfc12aa1d2d21deef206c39c" & adm_date=="2011-04-18" ~ "2011-08-30",
#rn== 16742
hash_key=="caafb47faaab3c9637821a50ce4dcef33b8e3a9fc275f0ef76f0c93681eb15ba" & adm_date=="2011-06-06" ~ "2011-07-04",
#rn== 16745
hash_key=="18096679bef8db59dbd0ca3be91fa36d7d9dcbbf06b85be2662f410d0146d1a2" & adm_date=="2011-06-17" ~ "2011-07-31",
#rn== 16755
hash_key=="40d3ff594c6c3ddd96e37e5e53fbd22030916a99a4f04cf6283ad188058f2a5b" & adm_date=="2011-06-23" ~ "2011-07-07",
#rn== 17500
hash_key=="667766680894eb203756044682c8445365bb0a831012ec49341b080390133d5d" & adm_date=="2011-06-20" ~ "2011-08-02",
#rn== 30449
hash_key=="60e3066c438a10246353d3a3bce07a58fbfda39465aa84debd48cede21319a94" & adm_date=="2012-10-16" ~ "2013-08-13",
#rn== 34193
hash_key=="60e3066c438a10246353d3a3bce07a58fbfda39465aa84debd48cede21319a94" & adm_date=="2012-10-16" ~ "2013-08-13",
#rn== 35638
hash_key=="08a5dc9a016c0525d7ceea954a8078391701ea9743b71bc2a012f0949952029f" & adm_date=="2013-01-07" ~ "2013-07-17",
#rn== 36161
hash_key=="71049ebb5d958e0647c01c4398c91ff3e02275f7dc5e2fefee5bc263a7653c96" & adm_date=="2013-01-28" ~ "2013-08-12",
#rn== 36415
hash_key=="52e218f6406835e8624ffe71595152560ec44a02a7580d673019eefa88df7a61" & adm_date=="2013-01-29" ~ "2013-04-02",
#rn== 37116
hash_key=="22c282462adfb8e48b3a6b697d533244c9c656a6b31ff87d0180679d9f5ce98d" & adm_date=="2013-02-08" ~ "2013-08-02",
#rn== 37958
hash_key=="221d71ae6c4dba4aee931b3ee518d47fd3972fed3fbf7f4d44c676bedca786c4" & adm_date=="2013-03-18" ~ "2013-07-10",
#rn== 38907
hash_key=="877ea9b68dde038d9f63d04d4e65d1eb27ac3f46af22e310c7c2114feb7f871b" & adm_date=="2013-04-18" ~ "2013-07-31",
#rn== 38908
hash_key=="14af0ddf318fb49877b16491b0fb7df491d98bd32dd854bdbec526f898dd9946" & adm_date=="2013-04-18" ~ "2013-06-17",
#rn== 38909
hash_key=="243a1044f746ae87432532552b4b93b6978fb3b18fa3a4305a11b2af698eb013" & adm_date=="2013-04-16" ~ "2013-07-27",
#rn== 39617
hash_key=="0e729e637c95d5d4486a7f822d14f0f1925ac358fff61d9bba9d7407b8e9abe7" & adm_date=="2013-04-29" ~ "2013-07-25",
#rn== 39618
hash_key=="289a7b6c884980dc60c9171bb05939bacf18a62551ebda723af75cbfc8308db9" & adm_date=="2013-05-08" ~ "2013-07-14",
#rn== 39620
hash_key=="cde086d548022a94e623bfc3d6b34202b28141ed2134ba35425ce4807e75f2fb" & adm_date=="2013-04-29" ~ "2013-07-02",
#rn== 40045
hash_key=="10fc40384411161967b222bf530a0378e0ae585bd69370d57d9c4fb49a1a34c3" & adm_date=="2013-05-22" ~ "2013-08-02",
#rn== 40293
hash_key=="67353760ae53ad8963176af0ec6cab9c4bdad13b9e53058e68e53f80b409b224" & adm_date=="2013-05-29" ~ "2013-08-07",
#rn== 40599
hash_key=="3ce639d4d0330242d1f7c1e6496e834ad3fa2b41bef89b09bc373e9dede8c981" & adm_date=="2013-05-02" ~ "2013-07-03",
#rn== 41114
hash_key=="5e6d9dcec9e717d4536f7cfa5cc0f713e7c2c7933058aeb9a37fec0a24da5151" & adm_date=="2013-06-06" ~ "2013-07-31",
#rn== 41117
hash_key=="e01e3218ba73e9d26178e7a6aceb86357695bc88117f1d7b89c8adbf55210528" & adm_date=="2013-06-05" ~ "2013-06-27",
#rn== 42456
hash_key=="421abbc2c85687aa87adec1c3146debf5ddea3ea71f65d708c2cf4d4dde86e38" & adm_date=="2013-07-02" ~ "2013-07-08",
#rn== 42633
hash_key=="567f1fd735550a9bc1a2ea8a838d87b69369caa106c2d0cd0a1b38581d09919f" & adm_date=="2013-07-09" ~ "2013-08-16",
#rn== 42634
hash_key=="7f259b5289b209cc669db813abfcd14519a21c4f69aaeb0190f094c61a52afad" & adm_date=="2013-06-28" ~ "2013-07-09",
#rn== 42854
hash_key=="49cca05a51baac5c836a053eac96674c775e2d7164209a04f09f8da34952b789" & adm_date=="2013-07-02" ~ "2013-08-02",
#rn== 43076
hash_key=="6adbbaff91e32138777abcf66a161d953722255c88368f9a5877d1ddfa48decd" & adm_date=="2013-08-06" ~ "2013-08-20",
#rn== 43181
hash_key=="02c866ee44e5a3a310cf18728753e3a4c3751d4ea4d61edc22d78606cde0fcc8" & adm_date=="2013-08-01" ~ "2013-08-16",
#rn== 43182
hash_key=="506be60207917af56fa39175f11ee5b3b874c0883245e37d0b2a79e0b24f08ad" & adm_date=="2013-08-01" ~ "2013-08-22",
TRUE ~ as.character(discharge_date)
)) |>
tidytable::mutate(disch_date= readr::parse_date(disch_date, format="%Y-%m-%d"))|>
tidytable::mutate(OBS = tidytable::case_when(
hash_key == "c4795829b6ea9cfc50b988c85deb391fa041d99a0ebca6b68a1378f37e3eb420" & adm_date == "2009-06-30" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "23874d59570adaac6690c85481b869570c10c2f8931fc20636037cdff04af067" & adm_date == "2008-07-02" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "5a16413f76625a09585c89fd3ea4fb05d1ea5cbfbc18247a9fb6e7e21534562d" & adm_date == "2008-07-23" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "11b143acdce4bf1d3a72acd4a703ea8c38543fd02585b4f3b0433e227929ed3c" & adm_date == "2008-03-04" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "986ded00e6ca834805a169ed528655e22f819bf5104d1729b2e1453f20f38065" & adm_date == "2008-12-05" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "d402a1e13f25b2411ca346b0dc84b9fffa45887e628abf09262777b6deae85aa" & adm_date == "2009-06-09" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "0d248b372c7224ae2cc1cabb750d6201150175b5d65ec0397ff2127d32b6b675" & adm_date == "2009-02-05" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "6eb67e1ead556eb1dbd21951747440057a17a872b33b468a37c9bf781219cef8" & adm_date == "2009-10-07" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "e0acff1477306ee93abfca7e251cc6d23db916b390a9fe506fbbefc371ce1d43" & adm_date == "2009-12-07" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "eb13b44585501a35df9ce6d262ca6e69e4aa34063af219e19cc95e7609e38cdf" & adm_date == "2010-04-26" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "058e8b2c02f98d488a78d78d80435e516c6628cd7edb87ecaf9f8c981d9614ba" & adm_date == "2010-05-03" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "4d42363412d6a435dd2762bbee7f9b4fe4117ff4c94d55e10472342156238ccb" & adm_date == "2010-06-17" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "228fc5b7b88c5f544f71f9ecfbad4d1750470b717f869a7aa9f01b0169a5d890" & adm_date == "2010-07-01" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "7ebe4155bb7741beef0f30ce47ecbc735bd1f7137d22e81ba21d5f12f8398fa2" & adm_date == "2010-10-04" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "93478aa27b121dbad91cb8e36ef60caa42fce6ca5b99478a77e9b8478df600f3" & adm_date == "2010-11-23" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "6500209f17b52ab4e00a140f7c8f0a10d9b073f81ac9443203f0a1b84c4dc1e8" & adm_date == "2010-11-25" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "4d6e97bfc2aeb15a8c6457ad1c84335de48b5456177b9749159ec2974537634f" & adm_date == "2010-11-25" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "1d5a63a966cea8241228f0057a38ef4e63e0fb353dda174dc95d4393e4cdcefa" & adm_date == "2010-12-02" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "c75bb8c43963dbad7a1b311497073a58b0e97bb82c5c63a4bc7ae4d1c9014592" & adm_date == "2011-01-13" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "f40999d751e9eb84f5ed6d832d96a1de872599c181e28dd420507c58d7464ccf" & adm_date == "2011-02-08" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "dbe7ddec7591332da15c3c4a1d4a2a1559d455a67b6c31a390ea546ea259c045" & adm_date == "2011-02-10" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "05ff2bf96ef3a294c09b39cf91c19f7a74b080487f13f62c449812f14cefff37" & adm_date == "2011-03-22" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "bdf81829448433489a21d8ac17de96f3765707798d8e2beb7653414f43f272aa" & adm_date == "2011-04-15" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "0bd45263c5217ae4324c23ca4bfec945d4100276fcac4e3e66ad5b6f5341d3fd" & adm_date == "2011-05-20" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "d6d0aaa21c50981871615a6b8886d1f69a3d0f125165f63f6a1c54729be5eea2" & adm_date == "2011-05-23" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "4728851a593a1490d73682e45945fe0f253d0f18dfc12aa1d2d21deef206c39c" & adm_date == "2011-04-18" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "caafb47faaab3c9637821a50ce4dcef33b8e3a9fc275f0ef76f0c93681eb15ba" & adm_date == "2011-06-06" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "18096679bef8db59dbd0ca3be91fa36d7d9dcbbf06b85be2662f410d0146d1a2" & adm_date == "2011-06-17" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "40d3ff594c6c3ddd96e37e5e53fbd22030916a99a4f04cf6283ad188058f2a5b" & adm_date == "2011-06-23" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "667766680894eb203756044682c8445365bb0a831012ec49341b080390133d5d" & adm_date == "2011-06-20" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "60e3066c438a10246353d3a3bce07a58fbfda39465aa84debd48cede21319a94" & adm_date == "2012-10-16" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "08a5dc9a016c0525d7ceea954a8078391701ea9743b71bc2a012f0949952029f" & adm_date == "2013-01-07" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "71049ebb5d958e0647c01c4398c91ff3e02275f7dc5e2fefee5bc263a7653c96" & adm_date == "2013-01-28" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "52e218f6406835e8624ffe71595152560ec44a02a7580d673019eefa88df7a61" & adm_date == "2013-01-29" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "22c282462adfb8e48b3a6b697d533244c9c656a6b31ff87d0180679d9f5ce98d" & adm_date == "2013-02-08" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "221d71ae6c4dba4aee931b3ee518d47fd3972fed3fbf7f4d44c676bedca786c4" & adm_date == "2013-03-18" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "877ea9b68dde038d9f63d04d4e65d1eb27ac3f46af22e310c7c2114feb7f871b" & adm_date == "2013-04-18" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "14af0ddf318fb49877b16491b0fb7df491d98bd32dd854bdbec526f898dd9946" & adm_date == "2013-04-18" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "243a1044f746ae87432532552b4b93b6978fb3b18fa3a4305a11b2af698eb013" & adm_date == "2013-04-16" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "0e729e637c95d5d4486a7f822d14f0f1925ac358fff61d9bba9d7407b8e9abe7" & adm_date == "2013-04-29" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "289a7b6c884980dc60c9171bb05939bacf18a62551ebda723af75cbfc8308db9" & adm_date == "2013-05-08" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "cde086d548022a94e623bfc3d6b34202b28141ed2134ba35425ce4807e75f2fb" & adm_date == "2013-04-29" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "10fc40384411161967b222bf530a0378e0ae585bd69370d57d9c4fb49a1a34c3" & adm_date == "2013-05-22" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "67353760ae53ad8963176af0ec6cab9c4bdad13b9e53058e68e53f80b409b224" & adm_date == "2013-05-29" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "3ce639d4d0330242d1f7c1e6496e834ad3fa2b41bef89b09bc373e9dede8c981" & adm_date == "2013-05-02" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "5e6d9dcec9e717d4536f7cfa5cc0f713e7c2c7933058aeb9a37fec0a24da5151" & adm_date == "2013-06-06" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "e01e3218ba73e9d26178e7a6aceb86357695bc88117f1d7b89c8adbf55210528" & adm_date == "2013-06-05" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "421abbc2c85687aa87adec1c3146debf5ddea3ea71f65d708c2cf4d4dde86e38" & adm_date == "2013-07-02" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "567f1fd735550a9bc1a2ea8a838d87b69369caa106c2d0cd0a1b38581d09919f" & adm_date == "2013-07-09" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "7f259b5289b209cc669db813abfcd14519a21c4f69aaeb0190f094c61a52afad" & adm_date == "2013-06-28" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "49cca05a51baac5c836a053eac96674c775e2d7164209a04f09f8da34952b789" & adm_date == "2013-07-02" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "6adbbaff91e32138777abcf66a161d953722255c88368f9a5877d1ddfa48decd" & adm_date == "2013-08-06" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "02c866ee44e5a3a310cf18728753e3a4c3751d4ea4d61edc22d78606cde0fcc8" & adm_date == "2013-08-01" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
hash_key == "506be60207917af56fa39175f11ee5b3b874c0883245e37d0b2a79e0b24f08ad" & adm_date == "2013-08-01" ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
TRUE ~ OBS))
#40d3ff594c6c3ddd96e37e5e53fbd22030916a99a4f04cf6283ad188058f2a5b
#2019-07-01
#60e3066c438a10246353d3a3bce07a58fbfda39465aa84debd48cede21319a94
#2019-07-01
#fecha_egresode_tratamiento discharge_date
#
invisible("store problematic parsing")
# NOTE(review): readr::problems() reads the "problems" attribute attached by a
# readr parser; this assumes disch_date was produced by readr::parse_date() or
# similar upstream — confirm, otherwise this is always an empty tibble.
problems_disch_date <- readr::problems(SISTRAT23_c1_2010_2022_df_prev00$disch_date)
invisible("Eliminate column with previous transformations of discharge date")
# Drop the intermediate column; the cleaned value lives in disch_date.
SISTRAT23_c1_2010_2022_df_prev00$discharge_date<- NULL
# Warn (but do not stop) if any rows failed to parse.
if(nrow(problems_disch_date) >0){message("There are still problems with the discharge date")}
invisible("Eliminate spaces and ; sign if starts with one")
SISTRAT23_c1_2010_2022_df_prev00$OBS <- sub("^;\\s*", "", SISTRAT23_c1_2010_2022_df_prev00$OBS)The HASH key 60e3066c438a10246353d3a3bce07a58fbfda39465aa84debd48cede21319a94 has two observations with the same admission date but a missing fecha_egreso_de_tratamiento. However, the observation with a more recent retrieval date includes a discharge date. We kept this information as the valid one.
1. Drop duplicated entries
Many treatments span more than one year, meaning entries from different yearly datasets may correspond to the same treatment. We detected duplicate rows across nearly every variable, with the exception of the entry’s row number in the consolidated dataset and the dataset’s retrieval year.
Code
#create vector with variable names /wo noninformative ones
# (SENDA ID, source-table markers, and row number do not identify a treatment)
names_c1 <- setdiff(names(SISTRAT23_c1_2010_2022_df_prev00), c("codigo_identificacion","TABLE", "TABLE_rec","rn"))
#, "fecha_egresode_tratamiento","disch_date"
#Group by duplicated rows
# data.table syntax: perfect_dup = number of rows sharing identical values in
# every informative column (names_c1); 1 means the row is unique.
tidytable::as_tidytable(SISTRAT23_c1_2010_2022_df_prev00)[, perfect_dup := .N, by = names_c1] %>%
assign(x="SISTRAT23_c1_2010_2022_df_prev1",.,envir = .GlobalEnv)
#summarise duplicates and times
# Frequency table of perfect_dup, rendered as Table 1.
SISTRAT23_c1_2010_2022_df_prev1%>%
tidytable::arrange(hash_key, adm_date, desc(TABLE_rec)) %>%
tidytable::group_by(perfect_dup) %>%
tidytable::summarise(n=n()) %>%
tidytable::rename("Times in dataset"= perfect_dup, "Number of rows"=`n`) %>%
as.data.frame() %>% #pander needs this
pander::pander(style = 'rmarkdown', split.tables = Inf, caption = "Table 1. Duplicated cases in almost every variable (excluding TABLE, SENDA ID and row number)")
# 1 1 104454
# 2 2 98114
# 3 3 16947
# 4 4 2884
# 5 5 470
# 6 6 156
# 7 7 28
# 8 8 8| Times in dataset | Number of rows |
|---|---|
| 1 | 104,373 |
| 2 | 98,082 |
| 3 | 16,944 |
| 4 | 2,884 |
| 5 | 470 |
| 6 | 156 |
| 7 | 28 |
| 8 | 8 |
Code
# Deduplicate rows identical in every informative variable (names_c1),
# keeping the entry from the most recent retrieval year.
tidytable::as_tidytable(SISTRAT23_c1_2010_2022_df_prev1)|>
#order by retrieval date of each observation (most recent first, so distinct() keeps it)
tidytable::arrange(desc(TABLE_rec))|>
#add a column with details of the process of cleaning and formatting
tidytable::mutate(OBS=case_when(perfect_dup>1 ~ "1.1. Duplicated Cases in Almost Every Variable", TRUE ~ ""))|>
#select observations which are different in al relevant data
tidytable::distinct(all_of(names_c1), .keep_all = TRUE)|>
#order data by hash key, admission date and database of yearly retrieval dates.
tidytable::arrange(hash_key, adm_date, desc(TABLE), desc(disch_date))|>
(\(df) {
# BUG FIX: message() writes to stderr and returns NULL invisibly; wrapping it
# in print() only echoed "NULL" into the rendered output.
message(paste0("Deduplicated in almost every variable, Entries: ", nrow(df)))
message(paste0("Deduplicated in almost every variable, RUNs: ", tidytable::distinct(df, hash_key)|> nrow()))
df
})()|>
# BUG FIX: with the native pipe the LHS is passed as the first unmatched
# positional argument (here `value`); the previous bare `.` placeholder was an
# undefined symbol that only avoided an error because the `pos` argument it
# matched is never evaluated when `envir` is supplied.
assign(x="SISTRAT23_c1_2010_2022_df_prev1b", envir = .GlobalEnv)
NULL
Since these duplicated rows contain identical values, no additional information is lost by deleting them. Thus, we selected only 159,908 rows from the original 223,061 observations.
We still needed to identify which of the repeated treatments contained the most recent information for each specific treatment. Therefore, we removed duplicated rows but retained entries with the greater amount of days of treatment or from a more recent yearly dataset. However, handling cases with negative days of treatment presented a challenge. The table below displays HASH keys with entries showing negative days of treatment. In these cases, it was essential to clarify some dates to prevent overlap between treatments. As shown in this table, some entries can be replaced with events having similar dates, while others were marked as missing and will be imputed once the dataset is normalized.
1.b. Most frequent episodes by patients
Before analyzing these matches, we also need to ask ourselves if there is any HASH Key that might not be referring to a single patient. To do this, we examine a table showing the count of occurrences in the database.
Code
# Table 2: the ten HASH keys with most entries after deduplication, to spot
# identifiers that might not correspond to a single patient.
SISTRAT23_c1_2010_2022_df_prev1b|>
janitor::tabyl(hash_key)|>
arrange(desc(n))|>
head(10)|>
data.frame()|>
tidytable::mutate(percent=scales::percent(percent, accuracy=0.01))|>
knitr::kable('markdown', caption = "Table 2. Most frequent HASHs")| hash_key | n | percent |
|---|---|---|
| 265141d6791da3b6b3f24385df58fff31734290e1e156271cec46b48151dd9ab | 15 | 0.01% |
| 96cc814066189b3be9b3e1a35b3b18ebb58f2252b5e91b22a5347d098450f364 | 15 | 0.01% |
| 06436b79c8d835b3542ad3a09753bac01befad1d48a18047dc4358a6665b87b7 | 13 | 0.01% |
| 889f3c40a42d555db3ce0fec762972f1466c4fb772ef02e44ca579cd2dd6e249 | 13 | 0.01% |
| 9550d49263e6f07553e11bb60f196e9d971081ceea9b200cb8d49bc9f3b8acf5 | 13 | 0.01% |
| cac708026974039c22857ef3b94355cc9aacef0af45d1bc79c5d1ad363a2fba3 | 13 | 0.01% |
| f12f949b082089d80fa15a9992ee7eb68dd83afd5d7ca506ad6ea7a34496ad56 | 13 | 0.01% |
| 2c2ff89b5d7804cce715ac9ba639f76a2b2ffc2047e50dd4d9e0d87a0d25efae | 12 | 0.01% |
| 363d31da8545038e8b644d1fe217d8dfff7dbb211d5009509212ddeaf3eb5875 | 12 | 0.01% |
| 6a265e4b241613507ab42ecac68f50f3d38dc4026db0c36571c0342785e9e243 | 12 | 0.01% |
Given the guidance from SENDA professionals, the duplicated rows in Table 2 actually correspond to different official IDs (RUNs). This indicates that distinct HASH values represent unique users, suggesting that HASH is a more reliable identifier than SENDA’s ID. Therefore, we skipped the step of identifying HASH keys and SENDA IDs.
1.c. Focus on Duplicated Cases and Dates of Admission
We needed to differentiate between duplicated cases and unique admissions from 2010 to 2022 to focus on treatment date ranges within each patient.
The first step consists of selecting columns to count for data completeness checks. We identified numeric and categorical columns, excluding specific columns created in the process (e.g., rn, perfect_dup, OBS and hash_key).
The second step consists of preparing the hash_adm_date dataset of duplicated entries with the same hash_key and adm_date. We also converted adm_date and disch_date to numeric values (adm_date_num and disch_date_num). Additionally, we calculated dit as the difference between discharge and admission dates, representing the treatment duration.
The third step identifies entries sharing the same admission date for each patient, counting how many entries exist per combination of hash_key and adm_date_num, storing this as ntot_hash_adm, and filtering for groups with more than one entry, as these indicate potential duplicates.
In the fourth step, we counted missing values in dit (days in treatment) within each group to track incomplete records. We also counted negative values in dit to flag data inconsistencies (e.g., discharge occurring before admission). Additionally, we added a sequential number for each row within the group (n_ord_hash_adm_date) to maintain the record order. Finally, we counted the unique treatment plans and programs in each group, storing these as ndis_tipo_plan and ndis_tipo_prog.
The fifth step involved calculating data completeness metrics for each entry by checking for missing values across the selected numeric and character columns, recorded in n_col_miss. Similarly, we calculated n_col_empty to count empty columns in character fields (e.g., empty strings). We then ranked entries to prioritize those with more complete data.
The final step involved filling missing dit values with a default duration. This was based on a fixed reference date (“2023-04-28” in numeric form: as.numeric(as.Date("2023-04-28"))) minus adm_date_num, ensuring all rows have consistent values for analysis.
Since some records showed inconsistencies in age, the average age (avg_age) within observations sharing the same hash and admission date was calculated to facilitate subsequent comparisons.
Code
# Select numeric columns to be scored for completeness later on.
# vapply() is type-stable (always logical(1) per column), unlike sapply(),
# whose return type silently changes on empty input.
num_columns <- vapply(SISTRAT23_c1_2010_2022_df_prev1b, is.numeric, logical(1))
num_column_names <- names(SISTRAT23_c1_2010_2022_df_prev1b)[num_columns]
# Exclude bookkeeping variables created during processing.
num_column_names <- setdiff(num_column_names, c("rn", "perfect_dup"))
# Select character columns, excluding identifiers and process metadata.
char_columns <- vapply(SISTRAT23_c1_2010_2022_df_prev1b, is.character, logical(1))
char_column_names <- names(SISTRAT23_c1_2010_2022_df_prev1b)[char_columns]
char_column_names <- setdiff(char_column_names, c("OBS", "TABLE", "TABLE_rec", "hash_key", "codigo_identificacion"))
invisible("Get a database of observations with the same HASH and admission date")
# Build the working set of potential duplicates: entries sharing hash_key and
# admission date, with per-row completeness scores and treatment duration (dit).
hash_adm_date <-
SISTRAT23_c1_2010_2022_df_prev1b|>
# numeric dates allow arithmetic; dit = days in treatment (can be NA or <0)
tidytable::mutate(adm_date_num=as.numeric(as.Date(adm_date)),
disch_date_num= as.numeric(as.Date(disch_date)),
dit= disch_date_num-adm_date_num)|>
#add a counter of distinct combinations of hash_key and discharge dates
tidytable::add_count(hash_key, adm_date_num, name = "ntot_hash_adm")|>
# keep only groups with more than one entry (potential duplicates)
tidytable::filter(ntot_hash_adm>1)|>
tidytable::group_by(hash_key, adm_date_num)|>
tidytable::mutate(
#get the number of distinct treatment plans and programs in the group
ndis_tipo_plan= tidytable::n_distinct(tipo_de_plan),
ndis_tipo_prog= tidytable::n_distinct(tipo_de_programa),
#if there are different programs or tr. plans, 1, else, 0
s1cab_plan_prog = tidytable::case_when(ndis_tipo_plan == 1 & ndis_tipo_prog == 1 ~ 0, ndis_tipo_plan > 1 | ndis_tipo_prog > 1 ~ 1, TRUE ~ 0))|>
#filter days in treatment with missing values
#tidytable::filter(is.na(dit)) |>
tidytable::ungroup() |>
# per-row completeness: n_col_miss counts NAs across selected numeric and all
# character columns; n_col_empty also counts strings shorter than 2 characters
tidytable::mutate_rowwise(
n_col_miss = sum(map_lgl(c_across(any_of(num_column_names)), ~ is.na(.x)),na.rm=T) + sum(map_lgl(c_across(where(is.character)), ~ is.na(.x)),na.rm=T),#,
n_col_empty = sum(sapply(c_across(any_of(char_column_names)), function(x) sum(is.na(x) | nchar(x) < 2)), na.rm = TRUE))|>
tidytable::mutate(birth_date_num= as.numeric(birth_date)) |>
# keep only the columns needed by the deduplication decisions below
tidytable::select(rn, TABLE_rec, hash_key, adm_date, adm_date_num, disch_date, tipo_de_plan, tipo_de_programa, id_centro, edad, motivo_de_egreso, ntot_hash_adm, dit, senda, s1cab_plan_prog, ndis_tipo_plan, ndis_tipo_prog, n_col_miss, n_col_empty, birth_date_num, edad_inicio_consumo, edad_inicio_sustancia_principal)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:
invisible("Export database to explore it")
# Rebuild the project root path (strip a trailing "cons" segment from getwd()).
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
wdpath
# Drive-letter heuristic: first letters "G" selects the Google Drive mount.
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath
# Export the duplicate-candidate set for manual inspection.
rio::export(file=paste0(wdpath,"cons/_out/db1c_hash_adm_date.xlsx"),hash_adm_date)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:
# Table 3: cross-tabulate duplicate groups by admission year, whether the group
# mixes plans/programs (s1cab_plan_prog), and whether it has missing discharge
# dates. One row per (hash_key, adm_date) group via slice(1).
hash_adm_date|>
data.frame()|>
tidytable::mutate(adm_year= clock::get_year(adm_date))|>
tidytable::group_by(hash_key, adm_date)|>
tidytable::mutate(
count_miss_disch_date = sum(is.na(disch_date), na.rm=T))|>
tidytable::slice(1)|>
tidytable::ungroup()|>
tidytable::group_by(adm_year, ndis_tipo_plan, ndis_tipo_prog, count_miss_disch_date, s1cab_plan_prog)|>
tidytable::summarise(Freq=n())|>
tidytable::ungroup()|>
# collapse the missing-discharge count to a binary indicator
tidytable::mutate(count_miss_disch_date = ifelse(as.numeric(count_miss_disch_date)>1, 1, 0))|>
tidytable::group_by(adm_year, s1cab_plan_prog, count_miss_disch_date)|>
tidytable::summarise(Freq= sum(Freq, na.rm=T))|>
tidytable::ungroup()|>
# one column per combination of plan/program change x missing-date indicator
tidyr::pivot_wider(
names_from = c("s1cab_plan_prog", "count_miss_disch_date"),
values_from = "Freq",
names_glue = "plan_prog_{s1cab_plan_prog}_miss_{count_miss_disch_date}",
values_fill = 0)|>
knitr::kable(style = 'markdown',
col.names= c("Admission year", "Same programs/plans, no miss disch date", "Distinct programs/plans, no miss disch date", "Same programs/plans, miss disch date", "Distinct programs/plans, miss disch date"),
caption = paste0("Table 3. Obs. w/ same HASH & admission date, by distinct program/plan (1) & missing values in discharge date(2) (total groups=", tidytable::group_by(hash_adm_date, hash_key, adm_date)|> tidytable::slice(1)|>nrow(),")"))#8442[1] "G:/My Drive/Alvacast/SISTRAT 2023//"
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/"
| Admission year | Same programs/plans, no miss disch date | Distinct programs/plans, no miss disch date | Same programs/plans, miss disch date | Distinct programs/plans, miss disch date |
|---|---|---|---|---|
| 2007 | 1 | 0 | 0 | 0 |
| 2008 | 3 | 6 | 0 | 0 |
| 2009 | 15 | 13 | 0 | 0 |
| 2010 | 32 | 46 | 0 | 0 |
| 2011 | 42 | 55 | 0 | 0 |
| 2012 | 32 | 47 | 0 | 0 |
| 2013 | 37 | 25 | 0 | 0 |
| 2014 | 47 | 24 | 2 | 0 |
| 2015 | 22 | 8 | 17 | 0 |
| 2016 | 9 | 0 | 29 | 0 |
| 2017 | 28 | 1 | 176 | 0 |
| 2018 | 325 | 6 | 974 | 8 |
| 2019 | 6253 | 85 | 64 | 0 |
| 2020 | 3 | 1 | 0 | 0 |
| 2021 | 5 | 0 | 0 | 0 |
| 2022 | 1 | 0 | 0 | 0 |
As seen in the table above, admission years between 2021 and 2022 did not show any changes in plan or program among entries with the same HASH and admission date. Therefore, we selected the following criteria for this group of observations, which we have termed “referral careers”: entries with the same admission date but with changes in plan or program and different discharge dates.
Code
invisible("plan_prog_1_miss_1 hay como 200 casos con fecha perdidas")
# Summary strings: n = rows, then groups (hash_key x adm_date), then patients
# (unique hash_key), overall and split by whether plan/program changed.
paste0("Duplicated HASH & adm date\n(n= ",formatC(nrow(hash_adm_date), big.mark=","),"\np= ",formatC(nrow(distinct(hash_adm_date,hash_key,adm_date)), big.mark=","),"\np= ",formatC(length(unique(hash_adm_date$hash_key)), big.mark=","),")")
#[1] "Duplicated HASH & adm date\n(n= 18,155\np= 8,442\np= 8,374)"
paste0("Duplicated HASH & adm date, changes in plan or program\n(n= ",formatC(nrow(tidytable::filter(hash_adm_date, s1cab_plan_prog==1)), big.mark=","),"\np= ",formatC(nrow(distinct(tidytable::filter(hash_adm_date, s1cab_plan_prog==1),hash_key,adm_date)), big.mark=","),"\np= ",formatC(length(unique(tidytable::filter(hash_adm_date, s1cab_plan_prog==1)$hash_key)), big.mark=","),")")
#[1] "Duplicated HASH & adm date, changes in plan or program\n(n= 698\np= 325\np= 325)"
paste0("Duplicated HASH & adm date, no changes in plan or program\n(n= ",formatC(nrow(tidytable::filter(hash_adm_date, s1cab_plan_prog==0)), big.mark=","),"\np= ",formatC(nrow(distinct(tidytable::filter(hash_adm_date, s1cab_plan_prog==0),hash_key,adm_date)), big.mark=","),"\np= ",formatC(length(unique(tidytable::filter(hash_adm_date, s1cab_plan_prog==0)$hash_key)), big.mark=","),")")
#[1] "Duplicated HASH & adm date, no changes in plan or program\n(n= 17,457\np= 8,117\np= 8,055)"[1] "Duplicated HASH & adm date\n(n= 18,155\np= 8,442\np= 8,374)"
[1] "Duplicated HASH & adm date, changes in plan or program\n(n= 698\np= 325\np= 325)"
[1] "Duplicated HASH & adm date, no changes in plan or program\n(n= 17,457\np= 8,117\np= 8,055)"
The following flowchart represents a logical decision-making process for handling data entries with multiple observations that share the same unique identifiers, specifically “HASH” and “admission date.” The aim is to systematically evaluate each observation and retain only the most relevant one based on specific criteria.
Code
#https://bergant.github.io/bpmn/
# Rebuild the project root path (same heuristic as in the export step above).
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
wdpath
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath
# Render the BPMN decision flowchart for the duplicate-handling process.
bpmn::bpmn(paste0(wdpath, "cons/_input/diagram_dup_hash_adm_date.bpmn"))[1] "G:/My Drive/Alvacast/SISTRAT 2023//"
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/"
There were numerous internal referrals that did not involve changes to the treatment plan or program characteristics. Additionally, while we were uncertain about prioritizing more days in treatment over the completeness of information (non-missing columns) — criteria that can be further examined in explore_1ca2_lower_dit_but_more_data.R — we proceeded with this prioritization nonetheless.
Code
invisible("==================================================================")
invisible("Separate according to possible presence of referrals")
# Split the duplicate candidates by whether the group mixes treatment plans or
# programs. Each subset carries overall reference means (age, birth date, onset
# ages) and a hash_key + admission-date concatenation key.
hash_adm_date_1ca<-
hash_adm_date |>
tidytable::filter(s1cab_plan_prog==0) |>
# there is variation in age records, so we keep overall means as references
tidytable::mutate(
avg_age= mean(edad, na.rm=TRUE),
avg_birth_date_num= mean(birth_date_num, na.rm=TRUE),
avg_onset_age= mean(edad_inicio_consumo, na.rm=TRUE),
avg_primary_sub_onset_age= mean(edad_inicio_sustancia_principal, na.rm=TRUE),
concat= paste0(hash_key, "_",adm_date))
hash_adm_date_1cb<-
hash_adm_date |>
tidytable::filter(s1cab_plan_prog==1)|>
# same reference columns for the subset with plan/program changes
tidytable::mutate(
avg_age= mean(edad, na.rm=TRUE),
avg_birth_date_num= mean(birth_date_num, na.rm=TRUE),
avg_onset_age= mean(edad_inicio_consumo, na.rm=TRUE),
avg_primary_sub_onset_age= mean(edad_inicio_sustancia_principal, na.rm=TRUE),
concat= paste0(hash_key, "_",adm_date))
#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:
invisible("==================================================================")
invisible("If the process were truly sequential, shouldn't grouped variables be handled before discarding cases?; we defined objects sequentially")
# Step 1c.a.1: within each (hash_key, admission date) group that mixes negative
# and non-negative days in treatment (dit), collect the row numbers (rn) of the
# negative-dit entries so they can be discarded later.
disc_1ca1<-
hash_adm_date_1ca|>
tidytable::group_by(hash_key, adm_date_num)|>
# entries in the group whose discharge precedes admission
tidytable::mutate(count_neg_dit = sum(dit<0, na.rm=T)) |>
tidytable::ungroup() |>
tidytable::mutate(
DECISION1= tidytable::case_when(
# "[*]" marks rows to discard; the unmarked label marks rows to keep
count_neg_dit>0 & count_neg_dit<ntot_hash_adm & dit<0~ "1c.a.1.cases w/neg days, removed neg. days[*]",
count_neg_dit>0 & count_neg_dit<ntot_hash_adm & dit>=0~ "1c.a.1.cases w/neg days, removed neg. days", T~""))|>
# BUG FIX: the filter previously tested the label "1c.a.1.cases w/neg days but
# w/ positive, removed neg. days[*]", which the case_when above never produces,
# so disc_1ca1 was always empty even though kept_1ca1 drops these same rows.
# The string now matches the label assigned above.
tidytable::filter(DECISION1 == "1c.a.1.cases w/neg days, removed neg. days[*]")|>
pull(rn)
# Step 1c.a.1 (keep side): retain all rows except the flagged negative-dit ones.
kept_1ca1<-
hash_adm_date_1ca|>
tidytable::group_by(hash_key, adm_date_num)|>
# NOTE(review): avg_age is recomputed here per (hash, adm_date) group,
# overwriting the overall mean set on hash_adm_date_1ca; disc_1ca1 does not do
# this — confirm which definition of avg_age is intended downstream.
tidytable::mutate(avg_age= mean(edad, na.rm=T))|>
tidytable::mutate(count_neg_dit = sum(dit<0, na.rm=T)) |>
tidytable::ungroup() |>
tidytable::mutate(
DECISION1= tidytable::case_when(
count_neg_dit>0 & count_neg_dit<ntot_hash_adm & dit<0~ "1c.a.1.cases w/neg days, removed neg. days[*]",
count_neg_dit>0 & count_neg_dit<ntot_hash_adm & dit>=0~ "1c.a.1.cases w/neg days, removed neg. days", T~""))|>
# drop the rows flagged for discarding ("[*]")
tidytable::filter(!DECISION1 == "1c.a.1.cases w/neg days, removed neg. days[*]")
# Progress report: groups and rows still duplicated after step 1.
message(paste0("1.Groups that still have more than one entry: ",kept_1ca1 |> tidytable::group_by(hash_key, adm_date_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |> nrow()))Code
message(paste0("1.Cases that still have more than one entry: ",
tidytable::group_by(kept_1ca1, hash_key, adm_date_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |>
tidytable::ungroup() |>
summarise(sum=sum(n))))Code
#1.Groups that still have more than one entry: 8115
#1.Cases that still have more than one entry: 17453
# Step 1c.a.2: in groups that mix missing and non-missing discharge dates,
# discard the entries whose discharge date is missing.
disc_1ca2 <-
kept_1ca1 |>
tidytable::group_by(hash_key, adm_date_num)|>
tidytable::mutate(count_miss_dit = sum(is.na(dit), na.rm=T)) |>
tidytable::ungroup() |>
tidytable::mutate(DECISION2= tidytable::case_when(
count_miss_dit>0 & count_miss_dit<ntot_hash_adm & is.na(disch_date)~ "1c.a.2.cases w/missing discharge date, removed missed discharge dates[*]",
count_miss_dit>0 & count_miss_dit<ntot_hash_adm & !is.na(disch_date)~ "1c.a.2.cases w/missing discharge date, removed missed discharge dates", T~"")) |>
tidytable::filter(DECISION2 == "1c.a.2.cases w/missing discharge date, removed missed discharge dates[*]") |>
pull(rn)
# Keep side: same labelling, complementary filter.
kept_1ca2 <-
kept_1ca1 |>
tidytable::group_by(hash_key, adm_date_num)|>
tidytable::mutate(count_miss_dit = sum(is.na(dit), na.rm=T)) |>
tidytable::ungroup() |>
tidytable::mutate(DECISION2= tidytable::case_when(
count_miss_dit>0 & count_miss_dit<ntot_hash_adm & is.na(disch_date)~ "1c.a.2.cases w/missing discharge date, removed missed discharge dates[*]",
count_miss_dit>0 & count_miss_dit<ntot_hash_adm & !is.na(disch_date)~ "1c.a.2.cases w/missing discharge date, removed missed discharge dates", T~"")) |>
tidytable::filter(!DECISION2 == "1c.a.2.cases w/missing discharge date, removed missed discharge dates[*]")
# Progress report for step 2.
message(paste0("2.Groups that still have more than one entry: ",kept_1ca2 |> tidytable::group_by(hash_key, adm_date_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |> nrow()))Code
message(paste0("2.Cases that still have more than one entry: ",
tidytable::group_by(kept_1ca2, hash_key, adm_date_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |>
tidytable::ungroup() |>
summarise(sum=sum(n))))Code
#2.Groups that still have more than one entry: 1079
#2.Cases that still have more than one entry: 2268
# Step 1c.a.3: in groups with more than one distinct dit (and at least one
# non-missing), keep the entry with the longest treatment duration and discard
# the rest (rank 1 = largest dit, via min_rank(-dit)).
disc_1ca3 <-
kept_1ca2 |>
#get cases w/more than one dit, but not every row is missing
tidytable::group_by(hash_key, adm_date_num)|>
tidytable::mutate(count_miss_dit2 = sum(is.na(dit), na.rm=T),
ndis_dit = tidytable::n_distinct(dit),
ntot_hash_adm2 = n(),
rank_by_dit = min_rank(-dit)) |>
tidytable::ungroup() |>
tidytable::mutate(DECISION3= tidytable::case_when(
ndis_dit>1 & count_miss_dit2<ntot_hash_adm2 & rank_by_dit!=1~ "1c.a.3.cases w/different discharge dates, removed entries w/ lower dit[*]",
ndis_dit>1 & count_miss_dit2<ntot_hash_adm2 & rank_by_dit==1~ "1c.a.3.cases w/different discharge dates, removed entries w/ lower dit",
T~""))|>
tidytable::filter(DECISION3 == "1c.a.3.cases w/different discharge dates, removed entries w/ lower dit[*]") |>
pull(rn)
# Keep side: same labelling, complementary filter.
kept_1ca3 <-
kept_1ca2 |>
#get cases w/more than one dit, but not every row is missing
tidytable::group_by(hash_key, adm_date_num)|>
tidytable::mutate(count_miss_dit2 = sum(is.na(dit), na.rm=T),
ndis_dit = tidytable::n_distinct(dit),
ntot_hash_adm2 = n(),
rank_by_dit = min_rank(-dit)) |>
tidytable::ungroup() |>
tidytable::mutate(DECISION3= tidytable::case_when(
ndis_dit>1 & count_miss_dit2<ntot_hash_adm2 & rank_by_dit!=1~ "1c.a.3.cases w/different discharge dates, removed entries w/ lower dit[*]",
ndis_dit>1 & count_miss_dit2<ntot_hash_adm2 & rank_by_dit==1~ "1c.a.3.cases w/different discharge dates, removed entries w/ lower dit",
T~""))|>
tidytable::filter(!DECISION3 == "1c.a.3.cases w/different discharge dates, removed entries w/ lower dit[*]")
# Progress report for step 3.
message(paste0("3.Groups that still have more than one entry: ",
kept_1ca3 |> tidytable::group_by(hash_key, adm_date_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |> nrow()))Code
message(paste0("3.Cases that still have more than one entry: ",
tidytable::group_by(kept_1ca3, hash_key, adm_date_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |>
tidytable::ungroup() |>
summarise(sum=sum(n))))Code
#3.Groups that still have more than one entry: 849
#3.Cases that still have more than one entry: 1802
# Step 1c.a.4: in groups whose entries differ in completeness, keep the entry
# with the fewest missing columns (rank_by_missing == 1) and discard the rest.
# NOTE(review): rank_by_empty and ndis_empty_data are computed and ndis_empty_data
# enters the trigger condition, but the discard decision only uses
# rank_by_missing — confirm rank_by_empty was not meant as a tie-breaker.
disc_1ca4 <-
kept_1ca3 |>
#filter days in treatment with missing values
#tidytable::filter(is.na(dit)) |>
#rank by missing and empty columns
tidytable::group_by(hash_key, adm_date_num)|>
tidytable::mutate(
rank_by_missing = min_rank(n_col_miss),
rank_by_empty = min_rank(n_col_empty),
ndis_miss_data = tidytable::n_distinct(n_col_miss),
ndis_empty_data = tidytable::n_distinct(n_col_empty)) |>
tidytable::ungroup() |>
tidytable::mutate(DECISION4= tidytable::case_when(
(ndis_miss_data>1 | ndis_empty_data>1) & rank_by_missing!=1~ "1c.a.4.cases w/different amount of missing data, removed entries w/ more missingness[*]",
(ndis_miss_data>1 | ndis_empty_data>1) & rank_by_missing==1~ "1c.a.4.cases w/different amount of missing data, removed entries w/ more missingness",
T~""))|>
tidytable::filter(DECISION4 == "1c.a.4.cases w/different amount of missing data, removed entries w/ more missingness[*]") |>
pull(rn)
# Keep side: same labelling, complementary filter.
kept_1ca4 <-
kept_1ca3 |>
#filter days in treatment with missing values
#tidytable::filter(is.na(dit)) |>
#rank by missing and empty columns
tidytable::group_by(hash_key, adm_date_num)|>
tidytable::mutate(
rank_by_missing = min_rank(n_col_miss),
rank_by_empty = min_rank(n_col_empty),
ndis_miss_data = tidytable::n_distinct(n_col_miss),
ndis_empty_data = tidytable::n_distinct(n_col_empty)) |>
tidytable::ungroup() |>
tidytable::mutate(DECISION4= tidytable::case_when(
(ndis_miss_data>1 | ndis_empty_data>1) & rank_by_missing!=1~ "1c.a.4.cases w/different amount of missing data, removed entries w/ more missingness[*]",
(ndis_miss_data>1 | ndis_empty_data>1) & rank_by_missing==1~ "1c.a.4.cases w/different amount of missing data, removed entries w/ more missingness",
T~""))|>
tidytable::filter(!DECISION4 == "1c.a.4.cases w/different amount of missing data, removed entries w/ more missingness[*]")
# Progress report for step 4.
message(paste0("4.Groups that still have more than one entry: ",kept_1ca4 |> tidytable::group_by(hash_key, adm_date_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |> nrow()))Code
message(paste0("4.Cases that still have more than one entry: ",
tidytable::group_by(kept_1ca4, hash_key, adm_date_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |>
tidytable::ungroup() |>
summarise(sum=sum(n))))Code
#4.Groups that still have more than one entry: 841
#4.Cases that still have more than one entry: 1785
# Step 1c.a.5: prefer the most recent retrieval year. TABLE_rec values shorter
# than 5 chars get a trailing "0" so that e.g. "2019" sorts consistently with
# half-year codes; TABLE_rec2 is the numeric retrieval marker divided by 10.
disc_1ca5 <-
kept_1ca4 |>
tidytable::mutate(TABLE_rec= ifelse(nchar(TABLE_rec) < 5,paste0(TABLE_rec, "0"), as.character(TABLE_rec))) |>
tidytable::mutate(TABLE_rec2= readr::parse_number(TABLE_rec)/10)|>
# Sort by hash and admission date from most recent to oldest, with the retrieval year of the database
# in descending order (starting with the most recent) and discharge date in descending order (most recent in the first row).
tidytable::arrange(hash_key, -adm_date_num, -TABLE_rec2, -disch_date)|>
tidytable::group_by(hash_key, adm_date_num)|>
# first_miss: the top-sorted row of the group lacks a discharge date
tidytable::mutate(first_miss = row_number() == 1 & is.na(disch_date))|>
tidytable::mutate(
cnt_first_miss = sum(first_miss==TRUE, na.rm=T),
rank_retrieval_yr = min_rank(-TABLE_rec2),
ndis_TABLE_rec = tidytable::n_distinct(TABLE_rec2))|>
tidytable::ungroup() |>
# discard older retrievals only when the newest row has a discharge date
tidytable::mutate(DECISION5= tidytable::case_when(
cnt_first_miss==0 & ndis_TABLE_rec>1 & rank_retrieval_yr!=1~ "1c.a.5.cases w/different retrieval yrs: earlier retrievals had no missing disch.dates, removed entries from previous retrieval yrs[*]",
cnt_first_miss==0 & ndis_TABLE_rec>1 & rank_retrieval_yr==1~ "1c.a.5.cases w/different retrieval yrs: earlier retrievals had no missing disch.dates, removed entries from previous retrieval yrs",
T~""))|>
tidytable::filter(DECISION5 == "1c.a.5.cases w/different retrieval yrs: earlier retrievals had no missing disch.dates, removed entries from previous retrieval yrs[*]")|>
pull(rn)
# Keep side: same labelling, complementary filter.
kept_1ca5 <-
kept_1ca4 |>
tidytable::mutate(TABLE_rec= ifelse(nchar(TABLE_rec) < 5,paste0(TABLE_rec, "0"), as.character(TABLE_rec)))|>
tidytable::mutate(TABLE_rec2= readr::parse_number(TABLE_rec)/10)|>
# Sort by hash and admission date from most recent to oldest, with the retrieval year of the database
# in descending order (starting with the most recent) and discharge date in descending order (most recent in the first row).
tidytable::arrange(hash_key, -adm_date_num, -TABLE_rec2, -disch_date)|>
tidytable::group_by(hash_key, adm_date_num)|>
tidytable::mutate(first_miss = row_number() == 1 & is.na(disch_date))|>
tidytable::mutate(
cnt_first_miss = sum(first_miss==TRUE, na.rm=T),
rank_retrieval_yr = min_rank(-TABLE_rec2),
ndis_TABLE_rec = tidytable::n_distinct(TABLE_rec2))|>
tidytable::ungroup() |>
tidytable::mutate(DECISION5= tidytable::case_when(
cnt_first_miss==0 & ndis_TABLE_rec>1 & rank_retrieval_yr!=1~ "1c.a.5.cases w/different retrieval yrs: earlier retrievals had no missing disch.dates, removed entries from previous retrieval yrs[*]",
cnt_first_miss==0 & ndis_TABLE_rec>1 & rank_retrieval_yr==1~ "1c.a.5.cases w/different retrieval yrs: earlier retrievals had no missing disch.dates, removed entries from previous retrieval yrs",
T~""))|>
tidytable::filter(!DECISION5 == "1c.a.5.cases w/different retrieval yrs: earlier retrievals had no missing disch.dates, removed entries from previous retrieval yrs[*]")
#more than one entry grouped, and where the amount of missing dits is different vs. the amount of rows.
#should be 18 cases
# Sanity check: count remaining multi-entry groups not fully missing in dit.
kept_1ca5 |> #
tidytable::group_by(hash_key, adm_date_num)|>
tidytable::mutate(count_miss_dit3 = sum(is.na(dit), na.rm=T),
ntot_hash_adm3 = n()) |>
tidytable::ungroup() |>
tidytable::filter(ntot_hash_adm3>1,ntot_hash_adm3!=count_miss_dit3) |> nrow()
#8
invisible("5b657cdf82ab1ca69e7e8f789bf8515d742731412354ea54ea67cf3d189173a4 este se debió haber ido antes, en el 4")
# Progress report for step 5.
message(paste0("5.Groups that still have more than one entry: ",kept_1ca5 |> tidytable::group_by(hash_key, adm_date_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |> nrow()))Code
message(paste0("5.Cases that still have more than one entry: ",
tidytable::group_by(kept_1ca5, hash_key, adm_date_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |>
tidytable::ungroup() |>
summarise(sum=sum(n))))Code
# 5.Groups that still have more than one entry: 218
# 5.Cases that still have more than one entry: 538
# Step 1c.a.6: groups whose entries share one retrieval year and one discharge
# date are pure duplicates; keep only the first row of the (already sorted)
# group and discard the rest (rn_hash_adm != 1).
disc_1ca6 <-
kept_1ca5 |>
tidytable::group_by(hash_key, adm_date_num)|>
#count amount missing days in treatment
#count number of rows
tidytable::mutate(count_miss_dit3 = sum(is.na(dit), na.rm=T),
ntot_hash_adm3 = n(),
ndis_TABLE_rec2 = tidytable::n_distinct(TABLE_rec2),
ndis_disch_date = tidytable::n_distinct(disch_date),
rn_hash_adm= row_number())|>
tidytable::ungroup() |>
#consider that the database is already ordered
tidytable::mutate(DECISION6= tidytable::case_when(
ndis_TABLE_rec2==1 & ndis_disch_date==1 & rn_hash_adm!=1~ "1c.a.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs[*]",
ndis_TABLE_rec2==1 & ndis_disch_date==1 & rn_hash_adm==1~ "1c.a.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs",T~""))|>
tidytable::filter(DECISION6 == "1c.a.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs[*]")|>
pull(rn)
# Keep side: same labelling, complementary filter.
kept_1ca6 <-
kept_1ca5 |>
tidytable::group_by(hash_key, adm_date_num)|>
#count amount missing days in treatment
#count number of rows
tidytable::mutate(count_miss_dit3 = sum(is.na(dit), na.rm=T),
ntot_hash_adm3 = n(),
ndis_TABLE_rec2 = tidytable::n_distinct(TABLE_rec2),
ndis_disch_date = tidytable::n_distinct(disch_date),
rn_hash_adm= row_number())|>
tidytable::ungroup() |>
#consider that the database is already ordered
tidytable::mutate(DECISION6= tidytable::case_when(
ndis_TABLE_rec2==1 & ndis_disch_date==1 & rn_hash_adm!=1~ "1c.a.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs[*]",
ndis_TABLE_rec2==1 & ndis_disch_date==1 & rn_hash_adm==1~ "1c.a.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs",T~""))|>
tidytable::filter(!DECISION6 == "1c.a.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs[*]")
# Progress report for step 6.
message(paste0("6.Groups that still have more than one entry: ",kept_1ca6 |> tidytable::group_by(hash_key, adm_date_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |> nrow()))Code
message(paste0("6.Cases that still have more than one entry: ",
tidytable::group_by(kept_1ca6, hash_key, adm_date_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |>
tidytable::ungroup() |>
summarise(sum=sum(n))))Code
# 6.Groups that still have more than one entry: 214
# 6.Cases that still have more than one entry: 530
# Step 1c.a.7: groups where every entry lacks a discharge date; keep only the
# first sorted entry (the sort places the most recent retrieval first) and
# discard the rest. NOTE(review): the label says "last sorted entry" but the
# kept row is row_number() == 1 — confirm the intended wording.
disc_1ca7 <-
kept_1ca6 |>
tidytable::group_by(hash_key, adm_date_num)|>
#count amount missing days in treatment
#count number of rows
tidytable::mutate(count_miss_dit4 = sum(is.na(dit), na.rm=T),
ntot_hash_adm4 = n(),
rn_hash_adm2= row_number())|>
tidytable::ungroup() |>
#consider that the database is already ordered
tidytable::mutate(DECISION7= tidytable::case_when(
# BUG FIX: both branches previously tested rn_hash_adm2!=1, leaving the keep
# branch unreachable (kept rows were labelled ""). The keep branch now
# matches the first sorted entry (rn_hash_adm2==1), mirroring steps 1-6.
count_miss_dit4==ntot_hash_adm4 & rn_hash_adm2!=1~ "1c.a.7.cases w/ only missing disch. dates, get the last sorted entry[*]",
count_miss_dit4==ntot_hash_adm4 & rn_hash_adm2==1~ "1c.a.7.cases w/ only missing disch. dates, get the last sorted entry",T~""))|>
tidytable::filter(DECISION7 == "1c.a.7.cases w/ only missing disch. dates, get the last sorted entry[*]")|>
pull(rn)
# Step 1c.a.7 (kept rows): drop all but the first sorted row of groups in which
# every row has a missing days-in-treatment value.
kept_1ca7 <-
kept_1ca6 |>
tidytable::group_by(hash_key, adm_date_num)|>
#count amount missing days in treatment
#count number of rows
tidytable::mutate(count_miss_dit4 = sum(is.na(dit), na.rm=T),
ntot_hash_adm4 = n(),
rn_hash_adm2= row_number())|>
tidytable::ungroup() |>
#consider that the database is already ordered
tidytable::mutate(DECISION7= tidytable::case_when(
count_miss_dit4==ntot_hash_adm4 & rn_hash_adm2!=1~ "1c.a.7.cases w/ only missing disch. dates, get the last sorted entry[*]",
# BUG FIX: this branch previously repeated `rn_hash_adm2!=1` and was unreachable,
# so kept first rows never received the unflagged DECISION7 label (which later
# feeds the obs_1ca audit column). `==1` matches every other DECISION block.
count_miss_dit4==ntot_hash_adm4 & rn_hash_adm2==1~ "1c.a.7.cases w/ only missing disch. dates, get the last sorted entry",T~""))|>
tidytable::filter(!DECISION7 == "1c.a.7.cases w/ only missing disch. dates, get the last sorted entry[*]")
# Progress report after step 1c.a.7; both counts are expected to reach zero here.
message(paste0("7.Groups that still have more than one entry: ",kept_1ca7 |> tidytable::group_by(hash_key, adm_date_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |> nrow()))Code
message(paste0("7.Cases that still have more than one entry: ",
tidytable::group_by(kept_1ca7, hash_key, adm_date_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |>
tidytable::ungroup() |>
summarise(sum=sum(n))))Code
#7.Groups that still have more than one entry: 0
#7.Cases that still have more than one entry: 0
#8127
# The invisible() calls below are inert in-code notes documenting the case review
# that motivated the 1c.b branch (IDs are salted hashes, not personal data).
invisible("APPLY discarding")
invisible("What about cases where all observations only have missing discharge dates?")
invisible("Case with 1 row having more missing values and two with fewer. Keep the row with fewer missing values - ID: 71f48564995f6bfea9f7793c56c40a51d0acc2d80f8822cfcde787597cc0be9e")
invisible("4-row case - ID: 006683df777b093473d92a24bb23ed3516ec0515bf8d91f55d947bda86ec7388")
invisible("Year 2011 - ID: 00bdd3cf594f1df14230d8a31a960997bda563c7fd5f1633ff870fda4c92a59b")
invisible("ID: 00bdd3cf594f1df14230d8a31a960997bda563c7fd5f1633ff870fda4c92a59b - different discharge dates, same plan, program, and center; one entry shows abandonment, the other administrative discharge")
invisible("ID: 01cf3f2d6f609a036f84ac9f735e07b15b3332c424cffe6fe64380866a495838 - same admission date, different centers and discharge dates")
invisible("ID: 01ca660418b3a4434e86270394c58b6850b05867c665ec9332f204f6b6418812 - changes discharge date, but plan and center remain the same")
invisible("ID: 02c866ee44e5a3a310cf18728753e3a4c3751d4ea4d61edc22d78606cde0fcc8 - same admission date and database, everything identical except for discharge date")
invisible("ID: 00bdd3cf594f1df14230d8a31a960997bda563c7fd5f1633ff870fda4c92a59b - same center, plan, and program, but reason and number of days differ")
invisible("If identical database and all but discharge date match, retain the entry with the longest duration")
invisible("ID: cd5ab631d5fc6a47222360f74c30f2bcfea75a517def58a5eaf2cbd507edddcc - database 2012 has more days than 2011, with no changes in plan, program, or center")[1] 8
From this part of the process, we removed 9340 entries from the database.
Initially, we aimed to apply a stricter selection of cases to retain records that represent a continuous sequence of treatments, including plan changes. However, this approach proved less meaningful, as we eventually intended to group them as a single treatment package. Therefore, we chose to follow the same selection criteria but have noted that these represent treatments involving a change of plan or program. This allows for later disaggregation; for example, if a given continuous treatment period exceeds 1,095 days—which indicates an extended duration that makes the treatment less plausible.
However, under certain conditions in step 3 (after filtering negative and missing dates) we evaluated if, for each group with the same concat value (combination of HASH and admission date), the row with the flag ([*], less days in treatment) had fewer missing values (n_col_miss) and fewer empty values (n_col_empty) than the non-flagged row. Additionally, the row must come from TABLE_rec greater than 2012, and the SENDA column value should not be “No”. If these criteria were met, we kept the row.
After keeping (discarding those that do not meet the criteria), we generated a column named adm_date_rec. If the rows meeting the conditions have been kept, we used the admission date from the row with the highest dit (non-flagged) that matches the disch_date of the row with the lowest dit; if not, we retained the original adm_date. Next, we generated a column called dit_rec, calculated as the difference in days between disch_date and adm_date_rec. This allowed us to construct differentiated continuous trajectories (treatment careers).
Code
# Step 1c.b.1 (discarded rns): flag rows with negative days in treatment inside
# groups that also contain non-negative rows, and record their rns.
disc_1cb1<-
hash_adm_date_1cb|>
tidytable::group_by(hash_key, adm_date_num)|>
#there is variation in age records, so we kept a mean to keep a reference
tidytable::mutate(avg_age= mean(edad, na.rm=T))|>
tidytable::mutate(count_neg_dit = sum(dit<0, na.rm=T)) |>
tidytable::ungroup() |>
tidytable::mutate(
DECISION1= tidytable::case_when(
count_neg_dit>0 & count_neg_dit<ntot_hash_adm & dit<0~ "1c.b.1.cases w/neg days, removed neg. days[*]",
count_neg_dit>0 & count_neg_dit<ntot_hash_adm & dit>=0~ "1c.b.1.cases w/neg days, removed neg. days", T~""))|>
# BUG FIX: the filter previously matched the string
# "1c.b.1.cases w/neg days but w/ positive, removed neg. days[*]", which no
# case_when branch assigns, so disc_1cb1 was empty regardless of the data.
tidytable::filter(DECISION1 == "1c.b.1.cases w/neg days, removed neg. days[*]")|>
pull(rn)
# NOTE(review): the original run reported no matches, which may have been an
# artifact of the label mismatch above — re-verify after this fix.
invisible("No cases")
# Step 1c.b.1 (kept rows): remove negative-dit rows from mixed groups, while
# attaching per-group reference columns (avg_age, concatenated plan/program).
kept_1cb1<-
hash_adm_date_1cb|>
tidytable::group_by(hash_key, adm_date_num)|>
tidytable::mutate(avg_age= mean(edad, na.rm=T))|>
tidytable::mutate(count_neg_dit = sum(dit<0, na.rm=T))|>
#add concatenated type of plan and program
tidytable::mutate(tipode_plan_conc = paste(tipo_de_plan, collapse = ", "))|>
tidytable::mutate(tipode_prog_conc = paste(tipo_de_programa, collapse = ", "))|>
tidytable::ungroup()|>
tidytable::mutate(
DECISION1= tidytable::case_when(
count_neg_dit>0 & count_neg_dit<ntot_hash_adm & dit<0~ "1c.b.1.cases w/neg days, removed neg. days[*]",
count_neg_dit>0 & count_neg_dit<ntot_hash_adm & dit>=0~ "1c.b.1.cases w/neg days, removed neg. days", T~""))|>
tidytable::filter(!DECISION1 == "1c.b.1.cases w/neg days, removed neg. days[*]")
# Progress report: duplicated groups/rows remaining after step 1c.b.1.
message(paste0("Groups that still have more than one entry: ",kept_1cb1 |> tidytable::group_by(hash_key, adm_date_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |> nrow()))Code
message(paste0("Cases that still have more than one entry: ",
tidytable::group_by(kept_1cb1, hash_key, adm_date_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |>
tidytable::ungroup() |>
summarise(sum=sum(n))))Code
# Step 1c.b.2 (discarded rns): in groups mixing missing and non-missing
# discharge dates, flag the rows whose discharge date is missing.
disc_1cb2 <-
kept_1cb1 |>
tidytable::group_by(hash_key, adm_date_num)|>
tidytable::mutate(count_miss_dit = sum(is.na(dit), na.rm=T)) |>
tidytable::ungroup() |>
tidytable::mutate(DECISION2= tidytable::case_when(
count_miss_dit>0 & count_miss_dit<ntot_hash_adm & is.na(disch_date)~ "1c.b.2.cases w/missing discharge date, removed missed discharge dates[*]",
count_miss_dit>0 & count_miss_dit<ntot_hash_adm & !is.na(disch_date)~ "1c.b.2.cases w/missing discharge date, removed missed discharge dates", T~"")) |>
tidytable::filter(DECISION2 == "1c.b.2.cases w/missing discharge date, removed missed discharge dates[*]") |>
pull(rn)
# Step 1c.b.2 (kept rows): same logic, keeping the complement of the flagged rows.
kept_1cb2 <-
kept_1cb1 |>
tidytable::group_by(hash_key, adm_date_num)|>
tidytable::mutate(count_miss_dit = sum(is.na(dit), na.rm=T)) |>
tidytable::ungroup() |>
tidytable::mutate(DECISION2= tidytable::case_when(
count_miss_dit>0 & count_miss_dit<ntot_hash_adm & is.na(disch_date)~ "1c.b.2.cases w/missing discharge date, removed missed discharge dates[*]",
count_miss_dit>0 & count_miss_dit<ntot_hash_adm & !is.na(disch_date)~ "1c.b.2.cases w/missing discharge date, removed missed discharge dates", T~"")) |>
tidytable::filter(!DECISION2 == "1c.b.2.cases w/missing discharge date, removed missed discharge dates[*]")
# Progress report after step 1c.b.2.
message(paste0("Groups that still have more than one entry: ",
kept_1cb2 |> tidytable::group_by(hash_key, adm_date_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |> nrow()))Code
message(paste0("Cases that still have more than one entry: ",
tidytable::group_by(kept_1cb2, hash_key, adm_date_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |>
tidytable::ungroup() |>
summarise(sum=sum(n))))Code
# Sanity guard: abort if any hash_key + admission-date group consists entirely of
# rows with missing days in treatment (those need the keep-based logic below,
# not the discard-based logic above).
if(kept_1cb1 |>
tidytable::group_by(hash_key, adm_date_num)|>
tidytable::mutate(count_miss_dit = sum(is.na(dit), na.rm=T)) |>
tidytable::ungroup() |>
tidytable::filter(count_miss_dit==ntot_hash_adm)|>
nrow()>0){stop(paste0("Cases with every missing dates:",
kept_1cb1 |>
tidytable::group_by(hash_key, adm_date_num)|>
tidytable::mutate(count_miss_dit = sum(is.na(dit), na.rm=T)) |>
tidytable::filter(count_miss_dit==ntot_hash_adm)|>
tidytable::ungroup() |> nrow()
))}
# From this point the algorithm switches from discarding to keeping entries.
invisible("From now on, the algorithm whould change to keep entries")
# Step 1c.b.3 (discarded rns): in groups with more than one distinct dit (and at
# least one non-missing), flag every row that is not ranked first by dit
# (rank 1 = longest treatment, via min_rank(-dit)).
disc_1cb3 <-
kept_1cb2 |>
#get cases w/more than one dit, but not every row is missing
tidytable::group_by(hash_key, adm_date_num)|>
tidytable::mutate(count_miss_dit2 = sum(is.na(dit), na.rm=T),
ndis_dit = tidytable::n_distinct(dit),
ntot_hash_adm2 = n(),
rank_by_dit = min_rank(-dit)) |>
tidytable::ungroup() |>
tidytable::mutate(DECISION3= tidytable::case_when(
ndis_dit>1 & count_miss_dit2<ntot_hash_adm2 & rank_by_dit!=1~ "1c.b.3.cases w/different discharge dates, removed entries w/ lower dit[*]",
ndis_dit>1 & count_miss_dit2<ntot_hash_adm2 & rank_by_dit==1~ "1c.b.3.cases w/different discharge dates, removed entries w/ lower dit",
T~""))|>
tidytable::filter(DECISION3 == "1c.b.3.cases w/different discharge dates, removed entries w/ lower dit[*]") |>
pull(rn)
# Step 1c.b.3.5: alternative keep-path for flagged ([*]) rows. A flagged row is
# retained only if it comes from a post-2012 retrieval, senda != "No", and has
# strictly fewer missing AND fewer empty columns than the best non-flagged row
# in its concat group. For retained flagged cases the admission date is rebuilt
# (adm_date_rec) from the shorter treatment's discharge date, and dit_rec is
# recomputed, yielding chained "treatment career" segments.
kept_1cb3_5 <-
kept_1cb2 |>
#get cases w/more than one dit, but not every row is missing
tidytable::group_by(hash_key, adm_date_num)|>
tidytable::mutate(count_miss_dit2 = sum(is.na(dit), na.rm=T),
ndis_dit = tidytable::n_distinct(dit),
ntot_hash_adm2 = n(),
rank_by_dit = min_rank(-dit)) |>
tidytable::ungroup() |>
tidytable::mutate(DECISION3= tidytable::case_when(
ndis_dit>1 & count_miss_dit2<ntot_hash_adm2 & rank_by_dit!=1~ "1c.b.3.cases w/different discharge dates, removed entries w/ lower dit[*]",
ndis_dit>1 & count_miss_dit2<ntot_hash_adm2 & rank_by_dit==1~ "1c.b.3.cases w/different discharge dates, removed entries w/ lower dit",
T~""))|>
# Normalize 4-digit retrieval tags to the 5-digit form, then recover the
# retrieval year as a number (e.g. "20120" -> 2012).
tidytable::mutate(TABLE_rec= ifelse(nchar(TABLE_rec) < 5,paste0(TABLE_rec, "0"), as.character(TABLE_rec)))|>
tidytable::mutate(TABLE_rec2= readr::parse_number(TABLE_rec)/10)|>
tidytable::group_by(concat) |>
tidytable::filter( (grepl("\\[\\*\\]", DECISION3) & TABLE_rec2 > 2012 & senda != "No" & n_col_miss < max(n_col_miss[!grepl("\\[\\*\\]", DECISION3)], na.rm = TRUE) & n_col_empty < max(n_col_empty[!grepl("\\[\\*\\]", DECISION3)], na.rm = TRUE))| !grepl("\\[\\*\\]", DECISION3))|>
#add admission date
tidytable::ungroup() |>
tidytable::group_by(concat) |>
#if there is a flagged case, there are no missing discharge dates and days in treatment are not the shorter, then replace the admission date with the discharge date of the shorter treatment and calculate the days in treatment
tidytable::mutate(
adm_date_rec = case_when(
any(grepl("\\[\\*\\]", DECISION3)) & !is.na(disch_date[which.min(dit)]) & dit!=dit[which.min(dit)] ~
adm_date[which.max(dit)] + as.numeric(disch_date[which.min(dit)] - adm_date[which.max(dit)]),
TRUE ~ adm_date)
) |>
tidytable::ungroup() |>
tidytable::mutate(dit_rec = as.numeric(disch_date - adm_date_rec), adm_date_rec_num= as.numeric(adm_date_rec)) |>
tidytable::ungroup() |>
#0 days in tr.
# [1] "52eeaf6394e67f4d8957ea733e1aeb6cf82c759a8e77f00eb24e685c95fc60d2"
# [2] "66415234677fd4f8454fe77532a56a5ac9dbdeadb6766539a1ce06fdd33b83f7"
# Drop flagged rows whose rebuilt treatment collapses to zero days.
tidytable::filter(
tidytable::case_when(grepl("\\[\\*\\]", DECISION3) &
dit_rec==0~F,T~T))
#To check strange cases
# kept_1cb3_5 |> #349
# tidytable::group_by(concat) |>
# filter(any(grepl("\\[\\*\\]", DECISION3))) |> View()
# Progress report, now grouping by the REBUILT admission date (adm_date_rec_num).
message(paste0("3.5.Groups that still have more than one entry: ",
kept_1cb3_5 |> tidytable::group_by(hash_key, adm_date_rec_num)|>
tidytable::summarise(n= n())|>
tidytable::filter(n>1)|> nrow()))Code
message(paste0("3.5.Cases that still have more than one entry: ",
tidytable::group_by(kept_1cb3_5, hash_key, adm_date_rec_num)|>
tidytable::summarise(n= n())|>
tidytable::filter(n>1)|>
tidytable::ungroup() |>
summarise(sum=sum(n))))Code
# Groups that still have more than one entry: 5
# Cases that still have more than one entry: 10
#f4dbe23abde5d2bb5a48d69a9e3caebb470754a8b4185997022b307b08fa0b26, originally 3 cases, delete the lower dit, left 2
# Step 1c.b.3 (plain keep-path): drop flagged lower-dit rows outright; here the
# rebuilt columns simply mirror the originals (adm_date_rec = adm_date).
kept_1cb3 <-
kept_1cb2 |>
#get cases w/more than one dit, but not every row is missing
tidytable::group_by(hash_key, adm_date_num)|>
tidytable::mutate(count_miss_dit2 = sum(is.na(dit), na.rm=T),
ndis_dit = tidytable::n_distinct(dit),
ntot_hash_adm2 = n(),
rank_by_dit = min_rank(-dit)) |>
tidytable::ungroup() |>
tidytable::mutate(DECISION3= tidytable::case_when(
ndis_dit>1 & count_miss_dit2<ntot_hash_adm2 & rank_by_dit!=1~ "1c.b.3.cases w/different discharge dates, removed entries w/ lower dit[*]",
ndis_dit>1 & count_miss_dit2<ntot_hash_adm2 & rank_by_dit==1~ "1c.b.3.cases w/different discharge dates, removed entries w/ lower dit",
T~""))|>
tidytable::filter(!DECISION3 == "1c.b.3.cases w/different discharge dates, removed entries w/ lower dit[*]") |>
tidytable::mutate(adm_date_rec= adm_date) |>
tidytable::mutate(dit_rec = as.numeric(disch_date - adm_date_rec),
adm_date_rec_num= as.numeric(adm_date_rec))
# Progress report after step 1c.b.3 (original admission-date grouping).
message(paste0("3.Groups that still have more than one entry: ",
kept_1cb3 |> tidytable::group_by(hash_key, adm_date_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |> nrow()))Code
message(paste0("3.Cases that still have more than one entry: ",
tidytable::group_by(kept_1cb3, hash_key, adm_date_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |>
tidytable::ungroup() |>
summarise(sum=sum(n))))Code
#Groups that still have more than one entry: 5
#Cases that still have more than one entry: 10
# Merge the two keep-paths (3 and 3.5). Rows present in both are deduplicated by
# rn, preferring the kept_1cb3_5 version (arrange(..., desc(source)) puts
# "kept_1cb3_5" first, and distinct() keeps the first occurrence).
kept_1cb3_3_5 <- bind_rows(
kept_1cb3 %>% mutate(source = "kept_1cb3"),
kept_1cb3_5 %>% mutate(source = "kept_1cb3_5")
) %>%
#order by hash_key, original admission date and source
tidytable::arrange(hash_key, adm_date, desc(source)) %>% # First we sort by source, prioritizing kept_1cb3_5
tidytable::distinct(rn, .keep_all = TRUE) %>% # Keep the first unique rn, which comes from kept_1cb3_5 when duplicated
tidytable::select(-source) # Drop the auxiliary column
# Progress report on the merged set, using the rebuilt admission date.
message(paste0("3 & 3.5.Groups that still have more than one entry: ",
bind_rows(kept_1cb3_3_5) |> tidytable::group_by(hash_key, adm_date_rec_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |> nrow()))Code
message(paste0("3 & 3.5.Cases that still have more than one entry: ",
tidytable::group_by(bind_rows(kept_1cb3_3_5), hash_key, adm_date_rec_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |>
tidytable::ungroup() |>
summarise(sum=sum(n))))Code
#3 & 3.5.Groups that still have more than one entry: 22
#3 & 3.5.Cases that still have more than one entry: 44
#if cases with one entry are the same using adm_date_num and adm_date_num_rec, the replacement is bad done
#3 & 3.5.Groups that still have more than one entry: 5
#3 & 3.5.Cases that still have more than one entry: 10
# Step 1c.b.4 (discarded rns): in groups that differ in missingness, flag rows
# that are not ranked first by number of missing columns (fewest missing wins).
disc_1cb4 <-
kept_1cb3_3_5 |>
#filter days in treatment with missing values
#tidytable::filter(is.na(dit)) |>
#rank by missing and empty columns
tidytable::group_by(hash_key, adm_date_rec_num)|>
tidytable::mutate(
rank_by_missing = min_rank(n_col_miss),
rank_by_empty = min_rank(n_col_empty),
ndis_miss_data = tidytable::n_distinct(n_col_miss),
ndis_empty_data = tidytable::n_distinct(n_col_empty)) |>
tidytable::ungroup() |>
tidytable::mutate(DECISION4= tidytable::case_when(
(ndis_miss_data>1 | ndis_empty_data>1) & rank_by_missing!=1~ "1c.b.4.cases w/different amount of missing data, removed entries w/ more missingness[*]",
(ndis_miss_data>1 | ndis_empty_data>1) & rank_by_missing==1~ "1c.b.4.cases w/different amount of missing data, removed entries w/ more missingness",
T~""))|>
tidytable::filter(DECISION4 == "1c.b.4.cases w/different amount of missing data, removed entries w/ more missingness[*]") |>
pull(rn)
# Step 1c.b.4 (kept rows): complement of the flagged rows.
kept_1cb4 <-
kept_1cb3_3_5 |>
#filter days in treatment with missing values
#tidytable::filter(is.na(dit)) |>
#rank by missing and empty columns
tidytable::group_by(hash_key, adm_date_rec_num)|>
tidytable::mutate(
rank_by_missing = min_rank(n_col_miss),
rank_by_empty = min_rank(n_col_empty),
ndis_miss_data = tidytable::n_distinct(n_col_miss),
ndis_empty_data = tidytable::n_distinct(n_col_empty)) |>
tidytable::ungroup() |>
tidytable::mutate(DECISION4= tidytable::case_when(
(ndis_miss_data>1 | ndis_empty_data>1) & rank_by_missing!=1~ "1c.b.4.cases w/different amount of missing data, removed entries w/ more missingness[*]",
(ndis_miss_data>1 | ndis_empty_data>1) & rank_by_missing==1~ "1c.b.4.cases w/different amount of missing data, removed entries w/ more missingness",
T~""))|>
tidytable::filter(!DECISION4 == "1c.b.4.cases w/different amount of missing data, removed entries w/ more missingness[*]")
# Progress report after step 1c.b.4.
message(paste0("4.Groups that still have more than one entry: ",kept_1cb4 |> tidytable::group_by(hash_key, adm_date_rec_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |> nrow()))Code
message(paste0("4.Cases that still have more than one entry: ",
tidytable::group_by(kept_1cb4, hash_key, adm_date_rec_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |>
tidytable::ungroup() |>
summarise(sum=sum(n))))Code
#4.Groups that still have more than one entry: 3
#4.Cases that still have more than one entry: 6
# Step 1c.b.5 (discarded rns): when a group spans several retrieval years and the
# most recent retrieval (sorted first) has a non-missing discharge date, flag
# the rows from earlier retrievals.
disc_1cb5 <-
kept_1cb4 |>
tidytable::mutate(TABLE_rec= ifelse(nchar(TABLE_rec) < 5,paste0(TABLE_rec, "0"), as.character(TABLE_rec)), TABLE_rec= as.numeric(TABLE_rec)/10)|>
# Sort by hash and admission date from most recent to oldest, with the retrieval year of the database
# in descending order (starting with the most recent) and discharge date in descending order (most recent in the first row).
tidytable::arrange(hash_key, -adm_date_rec_num, -TABLE_rec, -disch_date)|>
tidytable::group_by(hash_key, adm_date_rec_num)|>
tidytable::mutate(first_miss = row_number() == 1 & is.na(disch_date))|>
tidytable::mutate(
cnt_first_miss = sum(first_miss==TRUE, na.rm=T),
rank_retrieval_yr = min_rank(-TABLE_rec),
ndis_TABLE_rec = tidytable::n_distinct(TABLE_rec))|>
tidytable::ungroup() |>
tidytable::mutate(DECISION5= tidytable::case_when(
cnt_first_miss==0 & ndis_TABLE_rec>1 & rank_retrieval_yr!=1~ "1c.b.5.cases w/different retrieval yrs: earlier retrievals had no missing disch.dates, removed entries from previous retrieval yrs[*]",
cnt_first_miss==0 & ndis_TABLE_rec>1 & rank_retrieval_yr==1~ "1c.b.5.cases w/different retrieval yrs: earlier retrievals had no missing disch.dates, removed entries from previous retrieval yrs",
T~""))|>
tidytable::filter(DECISION5 == "1c.b.5.cases w/different retrieval yrs: earlier retrievals had no missing disch.dates, removed entries from previous retrieval yrs[*]")|>
pull(rn)
# Step 1c.b.5 (kept rows): complement of the flagged rows.
kept_1cb5 <-
kept_1cb4 |>
tidytable::mutate(TABLE_rec= ifelse(nchar(TABLE_rec) < 5,paste0(TABLE_rec, "0"), as.character(TABLE_rec)), TABLE_rec= as.numeric(TABLE_rec)/10)|>
# Sort by hash and admission date from most recent to oldest, with the retrieval year of the database
# in descending order (starting with the most recent) and discharge date in descending order (most recent in the first row).
tidytable::arrange(hash_key, -adm_date_rec_num, -TABLE_rec, -disch_date)|>
tidytable::group_by(hash_key, adm_date_rec_num)|>
tidytable::mutate(first_miss = row_number() == 1 & is.na(disch_date))|>
tidytable::mutate(
cnt_first_miss = sum(first_miss==TRUE, na.rm=T),
rank_retrieval_yr = min_rank(-TABLE_rec),
ndis_TABLE_rec = tidytable::n_distinct(TABLE_rec))|>
tidytable::ungroup() |>
tidytable::mutate(DECISION5= tidytable::case_when(
cnt_first_miss==0 & ndis_TABLE_rec>1 & rank_retrieval_yr!=1~ "1c.b.5.cases w/different retrieval yrs: earlier retrievals had no missing disch.dates, removed entries from previous retrieval yrs[*]",
cnt_first_miss==0 & ndis_TABLE_rec>1 & rank_retrieval_yr==1~ "1c.b.5.cases w/different retrieval yrs: earlier retrievals had no missing disch.dates, removed entries from previous retrieval yrs",
T~""))|>
tidytable::filter(!DECISION5 == "1c.b.5.cases w/different retrieval yrs: earlier retrievals had no missing disch.dates, removed entries from previous retrieval yrs[*]")
# Progress report after step 1c.b.5.
message(paste0("5.Groups that still have more than one entry: ",kept_1cb5 |> tidytable::group_by(hash_key, adm_date_rec_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |> nrow()))Code
message(paste0("5.Cases that still have more than one entry: ",
tidytable::group_by(kept_1cb5, hash_key, adm_date_rec_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |>
tidytable::ungroup() |>
summarise(sum=sum(n))))Code
# Groups that still have more than one entry: 0
# Cases that still have more than one entry: 0
# Step 1c.b.6 (discarded rns): for groups with a single retrieval year and a
# single discharge date, flag everything but the first row.
disc_1cb6 <-
kept_1cb5 |>
tidytable::group_by(hash_key, adm_date_rec_num)|>
#count amount missing days in treatment
#count number of rows
# NOTE(review): this counts is.na(dit), while the kept_1cb6 counterpart below
# counts is.na(dit_rec) — confirm which is intended; count_miss_dit3 is not used
# in DECISION6, so the mismatch is currently inert.
tidytable::mutate(count_miss_dit3 = sum(is.na(dit), na.rm=T),
ntot_hash_adm3 = n(),
ndis_TABLE_rec2 = tidytable::n_distinct(TABLE_rec),
ndis_disch_date = tidytable::n_distinct(disch_date),
rn_hash_adm= row_number())|>
tidytable::ungroup() |>
#consider that the database is already ordered
tidytable::mutate(DECISION6= tidytable::case_when(
ndis_TABLE_rec2==1 & ndis_disch_date==1 & rn_hash_adm!=1~ "1c.b.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs[*]",
ndis_TABLE_rec2==1 & ndis_disch_date==1 & rn_hash_adm==1~ "1c.b.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs",T~""))|>
tidytable::filter(DECISION6 == "1c.b.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs[*]")|>
pull(rn)
# Step 1c.b.6 (kept rows): keep the first row of each such group.
kept_1cb6 <-
kept_1cb5 |>
tidytable::group_by(hash_key, adm_date_rec_num)|>
#count amount missing days in treatment
#count number of rows
tidytable::mutate(count_miss_dit3 = sum(is.na(dit_rec), na.rm=T),
ntot_hash_adm3 = n(),
ndis_TABLE_rec2 = tidytable::n_distinct(TABLE_rec),
ndis_disch_date = tidytable::n_distinct(disch_date),
rn_hash_adm= row_number())|>
tidytable::ungroup() |>
#consider that the database is already ordered
tidytable::mutate(DECISION6= tidytable::case_when(
ndis_TABLE_rec2==1 & ndis_disch_date==1 & rn_hash_adm!=1~ "1c.b.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs[*]",
ndis_TABLE_rec2==1 & ndis_disch_date==1 & rn_hash_adm==1~ "1c.b.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs",T~""))|>
tidytable::filter(!DECISION6 == "1c.b.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs[*]")
# Progress report after step 1c.b.6.
message(paste0("6.Groups that still have more than one entry: ",kept_1cb6 |> tidytable::group_by(hash_key, adm_date_rec_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |> nrow()))Code
message(paste0("6.Cases that still have more than one entry: ",
tidytable::group_by(kept_1cb6, hash_key, adm_date_rec_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |>
tidytable::ungroup() |>
summarise(sum=sum(n))))Code
# Groups that still have more than one entry: 0
# Cases that still have more than one entry: 0
# Step 1c.b.7 (discarded rns): in groups where every row has a missing
# recalculated days-in-treatment, flag all rows except the first sorted one.
disc_1cb7 <-
kept_1cb6 |>
tidytable::group_by(hash_key, adm_date_rec_num)|>
#count amount missing days in treatment
#count number of rows
tidytable::mutate(count_miss_dit4 = sum(is.na(dit_rec), na.rm=T),
ntot_hash_adm4 = n(),
rn_hash_adm2= row_number())|>
tidytable::ungroup() |>
#consider that the database is already ordered
tidytable::mutate(DECISION7= tidytable::case_when(
count_miss_dit4==ntot_hash_adm4 & rn_hash_adm2!=1~ "1c.b.7.cases w/ only missing disch. dates, get the last sorted entry[*]",
# BUG FIX: this branch previously repeated `rn_hash_adm2!=1` and was unreachable;
# the retained first-sorted row should carry the unflagged label.
count_miss_dit4==ntot_hash_adm4 & rn_hash_adm2==1~ "1c.b.7.cases w/ only missing disch. dates, get the last sorted entry",T~""))|>
tidytable::filter(DECISION7 == "1c.b.7.cases w/ only missing disch. dates, get the last sorted entry[*]")|>
pull(rn)
# Step 1c.b.7 (kept rows): drop all but the first sorted row of groups in which
# every recalculated days-in-treatment value is missing.
kept_1cb7 <-
kept_1cb6 |>
tidytable::group_by(hash_key, adm_date_rec_num)|>
#count amount missing days in treatment
#count number of rows
# CONSISTENCY FIX: count missingness on dit_rec (the recalculated measure used by
# disc_1cb7 above); the original counted is.na(dit), so the pulled disc rns and
# the rows filtered here could diverge.
tidytable::mutate(count_miss_dit4 = sum(is.na(dit_rec), na.rm=T),
ntot_hash_adm4 = n(),
rn_hash_adm2= row_number())|>
tidytable::ungroup() |>
#consider that the database is already ordered
tidytable::mutate(DECISION7= tidytable::case_when(
count_miss_dit4==ntot_hash_adm4 & rn_hash_adm2!=1~ "1c.b.7.cases w/ only missing disch. dates, get the last sorted entry[*]",
# BUG FIX: this branch previously repeated `rn_hash_adm2!=1` and was unreachable;
# the retained first row should carry the unflagged label (feeds obs_1cb later).
count_miss_dit4==ntot_hash_adm4 & rn_hash_adm2==1~ "1c.b.7.cases w/ only missing disch. dates, get the last sorted entry",T~""))|>
tidytable::filter(!DECISION7 == "1c.b.7.cases w/ only missing disch. dates, get the last sorted entry[*]")
# Final progress report for the 1c.b branch; both counts are expected to be zero.
message(paste0("7.Groups that still have more than one entry: ",kept_1cb7 |> tidytable::group_by(hash_key, adm_date_rec_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |> nrow()))Code
message(paste0("7.Cases that still have more than one entry: ",
tidytable::group_by(kept_1cb7, hash_key, adm_date_rec_num) |>
tidytable::summarise(n= n()) |>
tidytable::filter(n>1) |>
tidytable::ungroup() |>
summarise(sum=sum(n))))From this part of the process, we removed 356 entries from the database.
Code
# Build compact audit tables: collapse DECISION1..7 per row into a single
# semicolon-separated obs_1ca / obs_1cb string (empty decisions are squeezed out
# by the gsub chain), keeping only rn plus the carried-over summary columns.
kept_1ca<-
kept_1ca7[,c("rn",paste0("DECISION",1:7),"avg_age","avg_birth_date_num","avg_onset_age","avg_primary_sub_onset_age")] %>%
tidytable::mutate(
obs_1ca = apply(.[, paste0("DECISION", 1:7)], 1, function(row) {
glue::glue_collapse(row, sep = "::")
})) |>
tidytable::select(-any_of(paste0("DECISION",1:7))) |>
tidytable::mutate(obs_1ca=gsub(":+", ";", obs_1ca)) |>
tidytable::mutate(obs_1ca=gsub("^;", "", obs_1ca)) |>
tidytable::mutate(obs_1ca=gsub(";", "; ", obs_1ca))
# Same audit table for the 1c.b branch, also carrying the rebuilt date columns.
kept_1cb<-
kept_1cb7[,c("rn",paste0("DECISION",1:7), "tipode_plan_conc", "tipode_prog_conc", "avg_age", "avg_birth_date_num", "avg_onset_age", "avg_primary_sub_onset_age", "adm_date_rec", "dit_rec", "adm_date_rec_num")]%>%
tidytable::mutate(
obs_1cb = apply(.[, paste0("DECISION", 1:7)], 1, function(row) {
glue::glue_collapse(row, sep = "::")
})) |>
tidytable::select(-any_of(paste0("DECISION",1:7))) |>
tidytable::mutate(obs_1cb=gsub(":+", ";", obs_1cb)) |>
tidytable::mutate(obs_1cb=gsub("^;", "", obs_1cb)) |>
tidytable::mutate(obs_1cb=gsub(";", "; ", obs_1cb))
invisible("====================================================")
invisible("eliminate discarded entries")
# Apply the deduplication: drop every rn that entered a 1c.a/1c.b group but did
# not survive to kept_1ca7/kept_1cb7, then re-derive the date columns, join the
# audit tables back in, and consolidate the OBS annotation column.
SISTRAT23_c1_2010_2022_df_prev1c<-
SISTRAT23_c1_2010_2022_df_prev1b|>
tidytable::filter(!rn %in% setdiff(hash_adm_date_1ca$rn ,kept_1ca7$rn))|>
tidytable::filter(!rn %in% setdiff(hash_adm_date_1cb$rn ,kept_1cb7$rn)) |>
tidytable::mutate(adm_date_num=as.numeric(as.Date(adm_date)),
disch_date_num= as.numeric(as.Date(disch_date)),
dit= disch_date_num-adm_date_num)|>
#join rows and info. of the filtered rows
tidylog::left_join(kept_1ca, by="rn") |>
tidylog::left_join(kept_1cb, by="rn") |>
#add info to the observation column
tidytable::mutate(OBS = tidytable::case_when( !is.na(obs_1ca)~ glue("{OBS};{obs_1ca}"),T~OBS), OBS = tidytable::case_when(!is.na(obs_1cb)~ glue("{OBS};{obs_1cb}"), T~OBS)) |>
#fill columns
# Coalesce the .x/.y duplicates produced by the two joins, preferring the 1c.a
# (.x) values; rebuilt date columns fall back to the originals when absent.
tidytable::mutate(
avg_age = case_when(!is.na(avg_age.x) ~ avg_age.x, T ~ avg_age.y),
avg_birth_date_num = case_when(!is.na(avg_birth_date_num.x) ~ avg_birth_date_num.x, T ~ avg_birth_date_num.y),
avg_onset_age = case_when(!is.na(avg_onset_age.x) ~ avg_onset_age.x, T ~ avg_onset_age.y),
avg_primary_sub_onset_age = case_when(!is.na(avg_primary_sub_onset_age.x) ~ avg_primary_sub_onset_age.x, T ~ avg_primary_sub_onset_age.y),
adm_date_rec = case_when(!is.na(adm_date_rec)~adm_date_rec, T~adm_date),
dit_rec = case_when(!is.na(dit_rec)~dit_rec, T~dit),
adm_date_rec_num = case_when(!is.na(adm_date_rec_num)~adm_date_rec_num, T~adm_date_num)
) |>
#correct observation column
# The five repeats strip up to five stacked leading semicolons (gsub("^;") only
# removes one per pass).
tidytable::mutate(OBS= gsub("^;", "", OBS)) |>
tidytable::mutate(OBS= gsub("^;", "", OBS)) |>
tidytable::mutate(OBS= gsub("^;", "", OBS)) |>
tidytable::mutate(OBS= gsub("^;", "", OBS)) |>
tidytable::mutate(OBS= gsub("^;", "", OBS)) |>
#eliminate residual columns
tidytable::select(-obs_1ca, -obs_1cb, -perfect_dup, -any_of( c("avg_age.x", "avg_birth_date_num.x", "avg_onset_age.x", "avg_primary_sub_onset_age.x", "tipode_plan_conc", "tipode_prog_conc", "avg_age.y", "avg_birth_date_num.y", "avg_onset_age.y", "avg_primary_sub_onset_age.y"))) |>
# This hash had no admission date information (see narrative note below).
tidytable::filter(!hash_key== "c46caa3cd2c89a2222ce319cf6f5e98392f928e0544ee5487ff6bedc1d3e76c2")Code
# Final invariant: after deduplication no hash_key + rebuilt-admission-date pair
# may appear more than once.
if(
SISTRAT23_c1_2010_2022_df_prev1c |>
tidytable::group_by(hash_key, adm_date_rec_num) |>
tidytable::summarise(n=n()) |>
# BUG FIX: any group with n>1 violates the invariant, so stop when at least one
# exists; the previous `nrow()>1` silently tolerated a single duplicated group.
tidytable::filter(n>1) |> nrow()>0){stop("There are still duplicated entries with the same HASH and admission date")}c46caa3cd2c89a2222ce319cf6f5e98392f928e0544ee5487ff6bedc1d3e76c2 was excluded from the analysis because it did not have information on the admission date.
2. Age in Datasets
Age is a unit-invariant variable that can help standardize users and, in turn, differentiate treatments. Up to this point, we had only used fuzzy criteria to distinguish between different admissions. Therefore, a casuistic approach was necessary to identify duplicate treatments through probabilistic deduplication. These strategies are illustrated in the primary data preparation diagram. However, many cases have invalid (n = 443) or missing ages (0). What’s more intriguing were the inconsistent ages within the same HASH values (n = 9,645).
Differences between dates were calculated by converting them into numeric values, using the “Unix epoch” (1970-01-01) as the reference point.
2.1.Rule-based solution to inconsistent dates of birth
Given the inaccuracies and inconsistencies regarding participants’ birth dates and ages, information from additional sources was used, particularly hospital records, mortality registries, Treatment Outcome Profile questionnaires, and SENDA Agreements 2 to 6, in cases where the patient had participated in one of these previously. This approach allows us to retain the most plausible dates and have greater certainty about participants’ ages when experiencing significant health events, based on available records.
Instead of relying on ages, which are highly dependent on the retrieval date of the database, we looked at birth dates. To get this information, we extracted the last 8 characters from the codigo_identificacion (SENDA ID). 3,713 patients had inconsistent birth dates.
We successfully integrated the Prosecutor’s Office records (Base_fiscalia_v2) by linking an older version of the encrypted national ID (datasets sent in May 2023) with the current ID using 98 matching characteristics shared by both SENDA databases (process available in “import_c1_top_data_adm_24.qmd”). For validation, we relied on attributes that are theoretically invariant for patients: sex and birth date. We excluded records with missing birth dates (e.g., entries showing 1900-01-01) and, in cases of discrepancies within this dataset, calculated the average birth date.
Code
# [1] "3,713" patients had inconsistent birth dates
#base::load(paste0(envpath,"data/20241015_out/","3_ndp_2024_11_08.Rdata"))
invisible("======================================================")
# Identify every hash_key whose recorded age (edad) is not unique across its rows.
hashs_inconsistent_ages <-
  SISTRAT23_c1_2010_2022_df_prev1c |>
  tidytable::group_by(hash_key) |>
  tidytable::summarise(n = n_distinct(edad)) |>
  tidytable::filter(n > 1) |>
  pull(hash_key)
invisible("======================================================")
invisible("2024-11-09: many had didfferent ages but same birthdates. Change the criteria")
# Report patients for whom every row has a missing birth_date.
message(paste0(
  "Patients with only missing values in birth date: ",
  SISTRAT23_c1_2010_2022_df_prev1c |>
    tidytable::group_by(hash_key) |>
    tidytable::summarise(n = n(), na_birth = sum(is.na(birth_date), na.rm = TRUE)) |>
    tidytable::ungroup() |>
    tidytable::filter(na_birth == n) |>
    nrow()
))Code
# hash_keys with more than one distinct birth_date recorded in C1 2010-2022.
hashs_inconsistent_birth_dates<-
SISTRAT23_c1_2010_2022_df_prev1c|> tidytable::group_by(hash_key)|> tidytable::summarise(n= tidytable::n_distinct(birth_date))|> tidytable::filter(n > 1)|> pull(hash_key)
# NOTE(review): `hashs_invalid_birth_dates` was originally computed with the exact
# same pipeline as `hashs_inconsistent_birth_dates`, so the two vectors are
# necessarily identical; reuse the first result instead of recomputing. If
# "invalid" was meant to use a different criterion (e.g. 1900-01-01
# placeholders), that filter still needs to be written — TODO confirm.
hashs_invalid_birth_dates<- hashs_inconsistent_birth_dates
SISTRAT23_c1_2010_2022_df_prev1c<-
SISTRAT23_c1_2010_2022_df_prev1c|>
(\(df) {
# message() already writes to stderr and returns invisible NULL; the original
# print(message(...)) wrapper only printed a spurious "NULL" line.
message(paste0("Missing birth dates, Entries: ", nrow(df|> tidytable::filter(is.na(birth_date)))))
message(paste0("Missing birth dates, RUNs: ", tidytable::distinct(tidytable::filter(df,is.na(birth_date)), hash_key)|> nrow()))
df
})()|>
tidytable::group_by(hash_key)|>
tidytable::mutate(
# Fill missing birth dates within each person (hash_key) with the first
# non-missing birth date observed for that person.
birth_date = tidytable::coalesce(birth_date, birth_date[!is.na(birth_date)][1])
)|>
tidytable::ungroup() Code
# Missing birth dates, Entries: 0
# Missing birth dates, RUNs: 0
invisible("No entries with missing birth dates")
invisible("======================================================")
#HOSP_filter_df edad_anos y run
# edad_anos = patient's age in years at the moment of admission; used to back
# out a birth date from the admission date.
#00328debf19b4829db5c12d9aa428dbe922e0bd7b46bda1bcc483aa80234a2bb, sin fecha de ingreso, NaN hospitalizaciones
# Impute missing admission dates as discharge date minus length of stay.
# BUG FIX: `dias_estad` is a length of stay in DAYS, but it was being subtracted
# with clock::add_years(); clock::add_days() is the correct operation.
HOSP_filter_df$fecha_ingreso <- ifelse(is.na(HOSP_filter_df$fecha_ingreso) & !is.na(HOSP_filter_df$fecha_egreso) & !is.na(HOSP_filter_df$dias_estad), clock::add_days(HOSP_filter_df$fecha_egreso, -HOSP_filter_df$dias_estad), HOSP_filter_df$fecha_ingreso)
# ifelse() strips the Date class; restore it, with an explicit origin so the
# numeric->Date conversion also works on R < 4.3.
HOSP_filter_df$fecha_ingreso <- as.Date(HOSP_filter_df$fecha_ingreso, origin = "1970-01-01")
# Derive the birth date once, AFTER imputing fecha_ingreso (the original also
# computed it before imputation, but that result was immediately overwritten,
# so the dead assignment has been removed).
HOSP_filter_df$fecha_nac<- clock::add_years(HOSP_filter_df$fecha_ingreso,-HOSP_filter_df$edad_anos, invalid = "previous") #if invalid day, e.g., 1991-02-29, for 1991-02-28
inconsistent_hashs_hosp<-
HOSP_filter_df |>
tidytable::filter(run %in% hashs_inconsistent_birth_dates) |>
(\(df) {
# message() returns invisible NULL; print(message(...)) printed spurious "NULL" lines.
message(paste0("Hospital, Entries: ", nrow(df)))
message(paste0("Hospital, RUNs: ", tidytable::distinct(df, run) |> nrow()))
df
})() |>
tidytable::distinct(run, fecha_nac) |>
tidytable::group_by(run) |>
tidytable::mutate(id = as.character(dplyr::row_number())) |>
tidytable::pivot_wider(names_from = id, values_from = fecha_nac,
names_prefix = "h_fechnac_")Code
invisible("Since this measure depended on the admission day and month, we calculated the average birth date.")
# Average the candidate birth dates per RUN: dates are averaged on their
# numeric (days-since-1970-01-01) representation and converted back to Date.
inconsistent_hashs_hosp_avg <-
  HOSP_filter_df |>
  tidytable::filter(run %in% hashs_inconsistent_birth_dates) |>
  tidytable::distinct(run, fecha_nac) |>
  tidytable::group_by(run) |>
  tidytable::summarise(
    h_avg_birth_date = as.Date(mean(as.numeric(fecha_nac), na.rm = TRUE), origin = "1970-01-01")
  ) |>
  tidytable::ungroup()
#Hospital, Entries: 9957
#Hospital, RUNs: 2593
invisible("======================================================")
#SISTRAT23_top_2015_2022_df$fecha_nacimiento
# TOP 2015-2022: distinct birth dates per HASH_KEY, spread to wide columns
# t_fechnac_1, t_fechnac_2, ... for later side-by-side comparison.
inconsistent_hashs_top<-
SISTRAT23_top_2015_2022_df |>
tidytable::filter(HASH_KEY %in% hashs_inconsistent_birth_dates) |>
(\(df) {
# message() returns invisible NULL; print(message(...)) printed spurious "NULL" lines.
message(paste0("TOP, Entries: ", nrow(df)))
message(paste0("TOP, RUNs: ", tidytable::distinct(df, HASH_KEY) |> nrow()))
df
})() |>
tidytable::distinct(HASH_KEY, birth_date) |>
tidytable::ungroup() |>
tidytable::group_by(HASH_KEY) |>
tidytable::mutate(id = as.character(dplyr::row_number())) |>
tidytable::pivot_wider(names_from = id, values_from = birth_date,
names_prefix = "t_fechnac_")Code
# TOP, Entries: 10938
# TOP, RUNs: 2321
invisible("======================================================")
#CONS_C2$fecha_nacimiento
# SENDA Agreement 2: parse "dd/mm/yyyy" birth dates, then spread the distinct
# dates per HASH_KEY to wide columns c2_fechnac_1, c2_fechnac_2, ...
CONS_C2$birth_date<-readr::parse_date(CONS_C2$fecha_nacimiento,"%d/%m/%Y")
inconsistent_hashs_c2<-
CONS_C2 |>
tidytable::filter(HASH_KEY %in% hashs_inconsistent_birth_dates) |>
(\(df) {
# message() returns invisible NULL; print(message(...)) printed spurious "NULL" lines.
message(paste0("C2, Entries: ", nrow(df)))
message(paste0("C2, RUNs: ", tidytable::distinct(df, HASH_KEY) |> nrow()))
df
})() |>
tidytable::distinct(HASH_KEY, birth_date) |>
tidytable::ungroup() |>
tidytable::group_by(HASH_KEY) |>
tidytable::mutate(id = as.character(dplyr::row_number())) |>
tidytable::pivot_wider(names_from = id, values_from = birth_date,
names_prefix = "c2_fechnac_")Code
# C2, Entries: 37
# C2, RUNs: 21
invisible("======================================================")
#CONS_C3$edad
# SENDA Agreement 3 has no explicit birth-date field: the last 8 characters of
# codigo_identificacion encode the birth date as "ddmmyyyy".
CONS_C3$birth_date<-
stringr::str_sub(CONS_C3$codigo_identificacion, nchar(CONS_C3$codigo_identificacion)-7,nchar(CONS_C3$codigo_identificacion))
CONS_C3$birth_date<- readr::parse_date(CONS_C3$birth_date, format="%d%m%Y")
inconsistent_hashs_c3<-
CONS_C3 |>
tidytable::filter(HASH_KEY %in% hashs_inconsistent_birth_dates) |>
(\(df) {
# message() returns invisible NULL; print(message(...)) printed spurious "NULL" lines.
message(paste0("C3, Entries: ", nrow(df)))
message(paste0("C3, RUNs: ", tidytable::distinct(df, HASH_KEY) |> nrow()))
df
})() |>
tidytable::distinct(HASH_KEY, birth_date) |>
tidytable::ungroup() |>
tidytable::group_by(HASH_KEY) |>
tidytable::mutate(id = as.character(dplyr::row_number())) |>
tidytable::pivot_wider(names_from = id, values_from = birth_date,
names_prefix = "c3_fechnac_")Code
# C3, Entries: 64
# C3, RUNs: 50
invisible("======================================================")
#CONS_C4$fechanacimiento
# SENDA Agreement 4: parse "dd/mm/yyyy" birth dates, then spread the distinct
# dates per HASH_KEY to wide columns c4_fechnac_1, c4_fechnac_2, ...
CONS_C4$birth_date <- readr::parse_date(CONS_C4$fechanacimiento,"%d/%m/%Y")
inconsistent_hashs_c4<-
CONS_C4 |>
tidytable::filter(HASH_KEY %in% hashs_inconsistent_birth_dates) |>
(\(df) {
# message() returns invisible NULL; print(message(...)) printed spurious "NULL" lines.
message(paste0("C4, Entries: ", nrow(df)))
message(paste0("C4, RUNs: ", tidytable::distinct(df, HASH_KEY) |> nrow()))
df
})() |>
tidytable::distinct(HASH_KEY, birth_date) |>
tidytable::ungroup() |>
tidytable::group_by(HASH_KEY) |>
tidytable::mutate(id = as.character(dplyr::row_number())) |>
tidytable::pivot_wider(names_from = id, values_from = birth_date,
names_prefix = "c4_fechnac_")Code
# C4, Entries: 37
# C4, RUNs: 26
invisible("======================================================")
#CONS_C5$fecha_nacimiento
# SENDA Agreement 5: parse "dd/mm/yyyy" birth dates, then spread the distinct
# dates per HASH_KEY to wide columns c5_fechnac_1, c5_fechnac_2, ...
CONS_C5$birth_date<-readr::parse_date(CONS_C5$fecha_nacimiento,"%d/%m/%Y")
inconsistent_hashs_c5<-
CONS_C5 |>
tidytable::filter(HASH_KEY %in% hashs_inconsistent_birth_dates) |>
(\(df) {
# message() returns invisible NULL; print(message(...)) printed spurious "NULL" lines.
message(paste0("C5, Entries: ", nrow(df)))
message(paste0("C5, RUNs: ", tidytable::distinct(df, HASH_KEY) |> nrow()))
df
})() |>
tidytable::distinct(HASH_KEY, birth_date) |>
tidytable::ungroup() |>
tidytable::group_by(HASH_KEY) |>
tidytable::mutate(id = as.character(dplyr::row_number())) |>
tidytable::pivot_wider(names_from = id, values_from = birth_date,
names_prefix = "c5_fechnac_")Code
# C5, Entries: 7
# C5, RUNs: 6
invisible("======================================================")
#CONS_C6$fechanacimiento
# SENDA Agreement 6: parse "dd/mm/yyyy" birth dates, then spread the distinct
# dates per HASH_KEY to wide columns c6_fechnac_1, c6_fechnac_2, ...
CONS_C6$birth_date<-readr::parse_date(CONS_C6$fechanacimiento,"%d/%m/%Y")
inconsistent_hashs_c6<-
CONS_C6 |>
tidytable::filter(HASH_KEY %in% hashs_inconsistent_birth_dates) |>
(\(df) {
# message() returns invisible NULL; print(message(...)) printed spurious "NULL" lines.
message(paste0("C6, Entries: ", nrow(df)))
message(paste0("C6, RUNs: ", tidytable::distinct(df, HASH_KEY) |> nrow()))
df
})() |>
tidytable::distinct(HASH_KEY, birth_date) |>
tidytable::ungroup() |>
tidytable::group_by(HASH_KEY) |>
tidytable::mutate(id = as.character(dplyr::row_number())) |>
tidytable::pivot_wider(names_from = id, values_from = birth_date,
names_prefix = "c6_fechnac_")Code
#C6, Entries: 10
#C6, RUNs: 9
invisible("======================================================")
# Mortality registry: distinct birth date(s) per hashkey; the single column is
# renamed m_birthdate (no wide spread — one date per person in this source).
inconsistent_hashs_mortality<-
mortality |>
tidytable::filter(hashkey %in% hashs_inconsistent_birth_dates) |>
(\(df) {
# message() returns invisible NULL; print(message(...)) printed spurious "NULL" lines.
message(paste0("Mortality, Entries: ", nrow(df)))
message(paste0("Mortality, RUNs: ", tidytable::distinct(df, hashkey) |> nrow()))
df
})() |>
tidytable::distinct(hashkey, birth_date) |>
tidytable::ungroup() |>
tidytable::rename("m_birthdate"="birth_date")Code
#Mortality, Entries: 171
#Mortality, RUNs: 171
invisible("======================================================")
# Rebuild the working-directory path (same idiom as earlier chunks: append
# "/cons", then strip every "cons"/"/cons" occurrence).
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
wdpath
# The drive letter selects the Google Drive mount ("G" vs fallback "E").
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath
invisible("Construct PO Office database")
# Load the Prosecutor's Office workbook (sheet "Base"; skip=4 drops header
# rows; guess_max forces type guessing over the full sheet).
Base_fiscalia_v2 <- readxl::read_excel(paste0(sub("2023","2019 \\(github\\)",wdpath),"/Base_v3_dic_2021.xlsx"),
sheet = "Base", skip = 4, guess_max = min(1000000, Inf))|> janitor::clean_names()|>
# Normalize the Metropolitan-Region labels in both region columns.
dplyr::mutate(gls_region=dplyr::case_when(gls_region=="REGION METROPOLITANA CENTRO NORTE"~ "RM Centro Norte",
gls_region=="REGION METROPOLITANA OCCIDENTE"~ "RM Occidente",
gls_region=="REGION METROPOLITANA ORIENTE"~ "RM Oriente",
gls_region=="REGION METROPOLITANA SUR"~ "RM Sur",
# NOTE(review): magrittr %>% used here while the rest of the chain uses |> — harmless but inconsistent.
T~gls_region)) %>%
dplyr::mutate(region_delito=dplyr::case_when(region_delito=="REGION METROPOLITANA CENTRO NORTE"~ "RM Centro Norte",
region_delito=="REGION METROPOLITANA OCCIDENTE"~ "RM Occidente",
region_delito=="REGION METROPOLITANA ORIENTE"~ "RM Oriente",
region_delito=="REGION METROPOLITANA SUR"~ "RM Sur",
T~region_delito))|>
# "*_simple" columns: keep only the first 10 characters ("YYYY-MM-DD") of each
# datetime and convert to Date.
dplyr::mutate(fec_comision_simple=as.Date(stringr::str_extract(as.character(fec_comision), "^.{10}")))|>
dplyr::mutate(fec_cbiorelacion_simple=as.Date(stringr::str_extract(as.character(fec_cbiorelacion), "^.{10}")))|>
dplyr::mutate(fec_nacimiento_simple=as.Date(stringr::str_extract(as.character(fec_nacimiento), "^.{10}")))|>
dplyr::mutate(termino_relacion_simple=as.Date(stringr::str_extract(as.character(termino_relacion), "^.{10}")))|>
# Re-map numbered column names — presumably to match an older column numbering
# used elsewhere in the project; verify against the consuming scripts.
dplyr::rename("marca_suspension_43"="marca_suspension_46","marca_pena_44"="marca_pena_47","marca_multa_45"="marca_multa_48","medida_alternativa_46"="medida_alternativa_49","clasificacion_pena_47"="clasificacion_pena_50","tramos_condena_48"="tramos_condena","clasificacion_penarpa_1_49"="clasificacion_penarpa_1_52","clasificacion_penarpa_2_50"="clasificacion_penarpa_2_53","marca_suspension_51"="marca_suspension_54","marca_pena_52"="marca_pena_55","marca_multa_53"="marca_multa_56","medida_alternativa_54"="medida_alternativa_57","clasificacion_pena_55"="clasificacion_pena_58","tramos_condena_56"="tramos_condena_2","clasificacion_penarpa_1_57"="clasificacion_penarpa_1_60","clasificacion_penarpa_2_58"="clasificacion_penarpa_2_61")|>
# Ages in fractional years: day differences divided by 365.25.
dplyr::mutate(edad_comision=(unclass(fec_comision_simple)-unclass(fec_nacimiento_simple))/365.25,
edad_ter_rel=(unclass(termino_relacion_simple)-unclass(fec_nacimiento_simple))/365.25)Code
# Keep only the columns needed for validation and compute, per encrypted ID,
# the average birth date and the count of distinct birth dates recorded.
# Rows with the 1900-01-01 placeholder are dropped (NA comparisons drop too).
Base_fiscalia_v2 <-
  Base_fiscalia_v2[, c("rut_enc_saf", "fec_nacimiento_simple", "sexo")] |>
  tidytable::filter(fec_nacimiento_simple != "1900-01-01") |>
  tidytable::group_by(rut_enc_saf) |>
  tidytable::mutate(
    avg_birth_date_po = mean(fec_nacimiento_simple, na.rm = TRUE),
    n_dis_birth_date_po = n_distinct(fec_nacimiento_simple)
  ) |>
  tidytable::ungroup()
# Link the current hash (HASH_KEY.x) to the May-2023 encrypted ID (HASH_KEY.y)
# via the crosswalk, attach the PO Office average birth date, and restrict to
# the inconsistent-birth-date cases.
inconsistent_hashs_may23_PO_office<-
OLD_NEW_SISTRAT23_c1_2010_2022_df2|>
tidylog::right_join(Base_fiscalia_v2, by=c("HASH_KEY.y"="rut_enc_saf"))|>
tidytable::select("HASH_KEY.x","HASH_KEY.y", "sexo.y","avg_birth_date_po")|>
tidytable::filter(HASH_KEY.x %in% hashs_inconsistent_birth_dates)|>
(\(df) {
# message() returns invisible NULL; print(message(...)) printed spurious "NULL" lines.
message(paste0("PO Office, Entries: ", nrow(df)))
message(paste0("PO Office, RUNs: ", tidytable::distinct(df, HASH_KEY.x) |> nrow()))
df
})() #|> Code
# tidytable::distinct(HASH_KEY.x, avg_birth_date_po) |>
# tidytable::ungroup()
#Office, Entries: 120446
#PO Office, RUNs: 3288
#from source(paste0(getwd(), "/_alt_scripts/fix_dfs2023_2024_c1.R"))
cat("2025-05-30= loaded other databases \n\n")
source(paste0(getwd(), "/_alt_scripts/fix_dfs2023_2024_c1.R"))Warning: There was 1 warning in dplyr::mutate(). ℹ In argument: birth_date = readr::parse_date(birth_date, format = "%d%m%Y"). Caused by warning: ! 3 parsing failures. row col expected actual 25686 – valid date 00000000 25688 – valid date 00000000 26445 – valid date 00000000
Warning: There was 1 warning in dplyr::mutate(). ℹ In argument: senda_adm_date = readr::parse_date(senda_adm_date, format = "%d-%m-%Y"). Caused by warning: ! 1 parsing failure. row col expected actual 9166 – date like %d-%m-%Y nan-nan-na
Warning: There was 1 warning in dplyr::mutate(). ℹ In argument: discharge_date = readr::parse_date(discharge_date, format = "%d-%m-%Y"). Caused by warning: ! 4 parsing failures. row col expected actual 285 – date like %d-%m-%Y nan-nan-nan 8290 – date like %d-%m-%Y nan-nan-nan 8409 – date like %d-%m-%Y nan-nan-nan 25002 – date like %d-%m-%Y nan-nan-nan
Code
# SENDA C1 2023-24: distinct birth dates per hash_key, spread to wide columns
# c1_2324_fechnac_1, c1_2324_fechnac_2, ...
inconsistent_hashs_c1_2324<-
SISTRAT23_c1_2023_2024_df2 |>
tidytable::filter(hash_key %in% hashs_inconsistent_birth_dates) |>
(\(df) {
# message() returns invisible NULL; print(message(...)) printed spurious "NULL" lines.
message(paste0("C1 2023-24, Entries: ", nrow(df)))
message(paste0("C1 2023-24, RUNs: ", tidytable::distinct(df, hash_key) |> nrow()))
df
})() |>
tidytable::distinct(hash_key, birth_date) |>
tidytable::ungroup() |>
tidytable::group_by(hash_key) |>
tidytable::mutate(id = as.character(dplyr::row_number())) |>
tidytable::pivot_wider(names_from = id, values_from = birth_date,
names_prefix = "c1_2324_fechnac_")Code
# TOP 2022-24: distinct birth dates per hashkey, spread to wide columns.
# BUG FIX: the wide-column prefix was "c1_2324_fechnac_", which collides with
# the columns produced by `inconsistent_hashs_c1_2324` when both tables are
# joined later (forcing .x/.y suffixing); use a distinct "top_2324_fechnac_"
# prefix instead.
inconsistent_hashs_top_2324<-
top_2224 |>
tidytable::filter(hashkey %in% hashs_inconsistent_birth_dates) |>
(\(df) {
# message() returns invisible NULL; print(message(...)) printed spurious "NULL" lines.
message(paste0("TOP 2023-24, Entries: ", nrow(df)))
message(paste0("TOP 2023-24, RUNs: ", tidytable::distinct(df, hashkey) |> nrow()))
df
})() |>
tidytable::distinct(hashkey, fecha_nacimiento) |>
tidytable::ungroup() |>
tidytable::group_by(hashkey) |>
tidytable::mutate(id = as.character(dplyr::row_number())) |>
tidytable::pivot_wider(names_from = id, values_from = fecha_nacimiento,
names_prefix = "top_2324_fechnac_")Code
# SENDA C2 2023-24: distinct birth dates per hashkey, spread to wide columns
# c2_2224_fechnac_1, c2_2224_fechnac_2, ...
# NOTE(review): the name says "2224" while the source df is `c2_2324` and the
# log label says "2023-24" — kept as-is for downstream compatibility.
inconsistent_hashs_c2_2224<-
c2_2324 |>
tidytable::filter(hashkey %in% hashs_inconsistent_birth_dates) |>
(\(df) {
# message() returns invisible NULL; print(message(...)) printed spurious "NULL" lines.
message(paste0("C2 2023-24, Entries: ", nrow(df)))
message(paste0("C2 2023-24, RUNs: ", tidytable::distinct(df, hashkey) |> nrow()))
df
})() |>
tidytable::distinct(hashkey, fecha_nacimiento) |>
tidytable::ungroup() |>
tidytable::group_by(hashkey) |>
tidytable::mutate(id = as.character(dplyr::row_number())) |>
tidytable::pivot_wider(names_from = id, values_from = fecha_nacimiento,
names_prefix = "c2_2224_fechnac_")NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
[1] "G:/My Drive/Alvacast/SISTRAT 2023//"
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/"
NULL
NULL
2025-05-30= loaded other databases
We took the new 2019 database and standardized it according to the formatting applied prior to the first step of the deduplication phase.
NULL
NULL
NULL
NULL
NULL
NULL
We joined the birth dates in wide format.
Code
# Join every external birth-date candidate (wide columns) onto the C1 rows with
# inconsistent birth dates, then keep only rows with >= 1 external date.
inconsistent_hashs_h_to_m<-
SISTRAT23_c1_2010_2022_df_prev1c|>
tidytable::filter(hash_key %in% hashs_inconsistent_birth_dates)|>
tidylog::left_join(inconsistent_hashs_hosp_avg, by=c("hash_key"="run"))|>
tidytable::select(hash_key, birth_date, h_avg_birth_date)|>
tidylog::left_join(inconsistent_hashs_top, by=c("hash_key"="HASH_KEY"))|>
tidylog::left_join(inconsistent_hashs_c2, by=c("hash_key"="HASH_KEY"))|>
tidylog::left_join(inconsistent_hashs_c3, by=c("hash_key"="HASH_KEY"))|>
tidylog::left_join(inconsistent_hashs_c4, by=c("hash_key"="HASH_KEY"))|>
tidylog::left_join(inconsistent_hashs_c5, by=c("hash_key"="HASH_KEY"))|>
tidylog::left_join(inconsistent_hashs_c6, by=c("hash_key"="HASH_KEY"))|>
tidylog::left_join(inconsistent_hashs_mortality, by=c("hash_key"="hashkey"))|>
tidylog::left_join(inconsistent_hashs_c1_2324, by=c("hash_key"="hash_key"))|>
tidylog::left_join(inconsistent_hashs_top_2324, by=c("hash_key"="hashkey"))|>
tidylog::left_join(inconsistent_hashs_c2_2224, by=c("hash_key"="hashkey"))|>
tidytable::mutate_rowwise(
# Number of non-missing external birth-date columns in this row; the range
# spans all externally-sourced columns (hosp avg through C2 2023-24).
non_NA_count = rowSums(!is.na(across(h_avg_birth_date:c2_2224_fechnac_1)))
) |>
dplyr::filter(non_NA_count>0)|>
(\(df) {
# message() returns invisible NULL; print(message(...)) printed spurious "NULL" lines.
message(paste0("Inconsistent birth date that have at least one external birth date, Entries: ", nrow(df)))
message(paste0("Inconsistent birth date that have at least one external birth date, RUNs: ", tidytable::distinct(df, hash_key) |> nrow()))
df
})() Code
#25/02/24
#nconsistent birth date that have at least one external birth date, Entries: 10146
#Inconsistent birth date that have at least one external birth date, RUNs: 3316
#
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:
invisible("Export database to explore it")
# Recompute the output path (same idiom as earlier chunks): append "/cons" to
# the working directory, then strip every "cons"/"/cons" occurrence.
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
# Drive letter selects the Google Drive mount ("G" vs fallback "E").
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath
# Export the wide comparison table for manual exploration.
rio::export(file=paste0(wdpath,"cons/_out/inconsistent_birthdates.csv"),inconsistent_hashs_h_to_m)NULL
NULL
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/"
Then we constructed the joined birth dates in long format.
Code
# Build the LONG version of the comparison table: one row per (hash_key,
# candidate birth date, source). C1's own dates get "orig_<k>" labels; each
# external source is stacked below via bind_rows of a melted (wide->long) table,
# with the melted column name becoming the source_birth label.
inconsistent_hashs_h_to_m_long<-
inconsistent_hashs_h_to_m|>
tidytable::group_by(hash_key)|>
tidytable::mutate(source_birth= dplyr::row_number())|>
tidytable::ungroup()|>
tidytable::select(hash_key, source_birth, everything())|>
tidytable::mutate(source_birth= paste0("orig_",source_birth))|>
tidytable::select(hash_key, source_birth, birth_date)|>
#first add hospital data
tidytable::bind_rows(cbind.data.frame(hash_key= inconsistent_hashs_hosp_avg$run, source_birth= rep("hosp_avg",times= nrow(inconsistent_hashs_hosp_avg)), birth_date= inconsistent_hashs_hosp_avg$h_avg_birth_date))|>
# Then stack TOP, C2-C6, mortality, C1 2023-24, C2 2023-24 and TOP 2023-24.
tidytable::bind_rows(cbind.data.frame(hash_key= reshape2::melt(inconsistent_hashs_top, id="HASH_KEY")$HASH_KEY, source_birth= as.character(reshape2::melt(inconsistent_hashs_top, id="HASH_KEY")$variable), birth_date= reshape2::melt(inconsistent_hashs_top, id="HASH_KEY")$value))|>
tidytable::bind_rows(cbind.data.frame(hash_key= reshape2::melt(inconsistent_hashs_c2, id="HASH_KEY")$HASH_KEY, source_birth= as.character(reshape2::melt(inconsistent_hashs_c2, id="HASH_KEY")$variable), birth_date= reshape2::melt(inconsistent_hashs_c2, id="HASH_KEY")$value))|>
tidytable::bind_rows(cbind.data.frame(hash_key= reshape2::melt(inconsistent_hashs_c3, id="HASH_KEY")$HASH_KEY, source_birth= as.character(reshape2::melt(inconsistent_hashs_c3, id="HASH_KEY")$variable), birth_date= reshape2::melt(inconsistent_hashs_c3, id="HASH_KEY")$value))|>
tidytable::bind_rows(cbind.data.frame(hash_key= reshape2::melt(inconsistent_hashs_c4, id="HASH_KEY")$HASH_KEY, source_birth= as.character(reshape2::melt(inconsistent_hashs_c4, id="HASH_KEY")$variable), birth_date= reshape2::melt(inconsistent_hashs_c4, id="HASH_KEY")$value))|>
tidytable::bind_rows(cbind.data.frame(hash_key= reshape2::melt(inconsistent_hashs_c5, id="HASH_KEY")$HASH_KEY, source_birth= as.character(reshape2::melt(inconsistent_hashs_c5, id="HASH_KEY")$variable), birth_date= reshape2::melt(inconsistent_hashs_c5, id="HASH_KEY")$value))|> tidytable::bind_rows(cbind.data.frame(hash_key= reshape2::melt(inconsistent_hashs_c6, id="HASH_KEY")$HASH_KEY, source_birth= as.character(reshape2::melt(inconsistent_hashs_c6, id="HASH_KEY")$variable), birth_date= reshape2::melt(inconsistent_hashs_c6, id="HASH_KEY")$value))|>
tidytable::bind_rows(cbind.data.frame(hash_key= reshape2::melt(inconsistent_hashs_mortality, id="hashkey")$hashkey, source_birth= as.character(reshape2::melt(inconsistent_hashs_mortality, id="hashkey")$variable), birth_date= reshape2::melt(inconsistent_hashs_mortality, id="hashkey")$value))|>
tidytable::bind_rows(cbind.data.frame(hash_key= reshape2::melt(inconsistent_hashs_c1_2324, id="hash_key")$hash_key, source_birth= as.character(reshape2::melt(inconsistent_hashs_c1_2324, id="hash_key")$variable), birth_date= reshape2::melt(inconsistent_hashs_c1_2324, id="hash_key")$value))|>
tidytable::bind_rows(cbind.data.frame(hash_key= reshape2::melt(inconsistent_hashs_c2_2224, id="hashkey")$hashkey, source_birth= as.character(reshape2::melt(inconsistent_hashs_c2_2224, id="hashkey")$variable), birth_date= reshape2::melt(inconsistent_hashs_c2_2224, id="hashkey")$value))|>
tidytable::bind_rows(cbind.data.frame(hash_key= reshape2::melt(inconsistent_hashs_top_2324, id="hashkey")$hashkey, source_birth= as.character(reshape2::melt(inconsistent_hashs_top_2324, id="hashkey")$variable), birth_date= reshape2::melt(inconsistent_hashs_top_2324, id="hashkey")$value))|>
tidytable::arrange(hash_key, source_birth)|>
tidytable::filter(!is.na(birth_date))|>
# Attach the recorded admission date and flag implausible ages at admission:
# adm_yr is first computed in fractional years, then recoded in place to the
# character flags "Less16"/"More90"/"" (the numeric value is discarded).
tidytable::left_join(distinct(subset(SISTRAT23_c1_2010_2022_df_prev1c, select=c("hash_key", "adm_date_rec")), hash_key, .keep_all = T), by="hash_key")|> tidytable::mutate(adm_yr=round(as.numeric((adm_date_rec-birth_date))/365.25,2),
adm_yr= tidytable::case_when(adm_yr<16~"Less16", adm_yr>90~"More90", T~""))
# adm_date_rec was only needed to compute adm_yr; drop it.
inconsistent_hashs_h_to_m_long$adm_date_rec<-NULL
#inconsistent_hashs_c1_2324 inconsistent_hashs_top_2324 inconsistent_hashs_c2_2224
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:
invisible("Export database to explore it")
# Recompute the output path (same idiom as earlier chunks).
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath
# Export the long-format comparison table for manual exploration.
rio::export(file=paste0(wdpath,"cons/_out/inconsistent_birthdates_long.csv"),inconsistent_hashs_h_to_m_long)
invisible("sample")
# Also export a reproducible sample of 100 hash_keys for manual review.
# NOTE(review): sample_n_with_seed() is a project helper (not base/dplyr) —
# presumably a seeded sample_n(); verify against its definition.
inconsistent_hashs_h_to_m_long|>
tidytable::filter(hash_key %in% dplyr::pull(sample_n_with_seed(data.frame(hashs_inconsistent_birth_dates),100, seed=2125),1)) |>
rio::export(file=paste0(wdpath,"cons/_out/inconsistent_birthdates_long_sample.csv"))[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/"
Code
# Complement of inconsistent_hashs_h_to_m: same joins, but keep only the rows
# with NO external birth-date candidate (non_NA_count == 0).
inconsistent_hashs_h_to_m_no_ext<-
SISTRAT23_c1_2010_2022_df_prev1c|>
tidytable::filter(hash_key %in% hashs_inconsistent_birth_dates)|>
tidylog::left_join(inconsistent_hashs_hosp_avg, by=c("hash_key"="run"))|>
tidytable::select(hash_key, birth_date, h_avg_birth_date)|>
tidylog::left_join(inconsistent_hashs_top, by=c("hash_key"="HASH_KEY"))|>
tidylog::left_join(inconsistent_hashs_c2, by=c("hash_key"="HASH_KEY"))|>
tidylog::left_join(inconsistent_hashs_c3, by=c("hash_key"="HASH_KEY"))|>
tidylog::left_join(inconsistent_hashs_c4, by=c("hash_key"="HASH_KEY"))|>
tidylog::left_join(inconsistent_hashs_c5, by=c("hash_key"="HASH_KEY"))|>
tidylog::left_join(inconsistent_hashs_c6, by=c("hash_key"="HASH_KEY"))|>
tidylog::left_join(inconsistent_hashs_mortality, by=c("hash_key"="hashkey"))|>
tidylog::left_join(inconsistent_hashs_c1_2324, by=c("hash_key"="hash_key"))|>
tidylog::left_join(inconsistent_hashs_top_2324, by=c("hash_key"="hashkey"))|>
tidylog::left_join(inconsistent_hashs_c2_2224, by=c("hash_key"="hashkey"))|>
tidytable::mutate_rowwise(
# Number of non-missing external birth-date columns in this row.
non_NA_count = rowSums(!is.na(across(h_avg_birth_date:c2_2224_fechnac_1)))
) |>
dplyr::filter(non_NA_count==0)|>
(\(df) {
# message() returns invisible NULL; print(message(...)) printed spurious "NULL" lines.
message(paste0("Inconsistent birth date that did not have external birth dates, Entries: ", nrow(df)))
message(paste0("Inconsistent birth date that did not have external birth dates, RUNs: ", tidytable::distinct(df, hash_key) |> nrow()))
df
})() NULL
NULL
Code
#Inconsistent birth date that did not have external birth dates, Entries: 967
#Inconsistent birth date that did not have external birth dates, RUNs: 397
invisible("Get PO Office for inconsistent dates")
# PO Office average birth date per current hash, for the inconsistent cases.
# NOTE(review): the variable name contains a typo ("brith"); kept unchanged
# because downstream code references it.
PO_brith_dates_for_inconsistent_dates <-
OLD_NEW_SISTRAT23_c1_2010_2022_df2|>
tidylog::right_join(Base_fiscalia_v2, by=c("HASH_KEY.y"="rut_enc_saf"))|>
tidytable::select("HASH_KEY.x","HASH_KEY.y", "sexo.y","avg_birth_date_po")|>
tidytable::filter(HASH_KEY.x %in% hashs_inconsistent_birth_dates)|>
(\(df) {
# message() returns invisible NULL; print(message(...)) printed spurious "NULL" lines.
message(paste0("PO Office, Entries: ", nrow(df)))
message(paste0("PO Office, RUNs: ", tidytable::distinct(df, HASH_KEY.x)|> nrow()))
df
})()|>
tidytable::distinct(HASH_KEY.x, avg_birth_date_po)|>
tidytable::ungroup()NULL
NULL
Code
#PO Office, Entries: 120446
#PO Office, RUNs: 3288
invisible("Joined hashs with inconsistent birth dates with no external data")
invisible("with info of PO Office")
# Long format for the no-external-data subset: number C1's own ("orig_") birth
# dates per hash_key, attach the PO Office average birth date, then compute the
# age at admission (years) and the absolute day difference between each
# candidate date and the PO Office average.
inconsistent_hashs_h_to_m_no_ext_long<-
inconsistent_hashs_h_to_m_no_ext|>
tidytable::group_by(hash_key)|>
tidytable::mutate(source_birth= dplyr::row_number())|>
tidytable::ungroup()|>
tidytable::select(hash_key, source_birth, everything())|>
tidytable::mutate(source_birth= paste0("orig_",source_birth))|>
tidytable::select(hash_key, source_birth, birth_date)|>
tidytable::left_join(PO_brith_dates_for_inconsistent_dates, by=c("hash_key"="HASH_KEY.x"))|>
tidytable::arrange(hash_key, source_birth)|>
tidytable::filter(!is.na(birth_date))|>
tidytable::left_join(distinct(subset(SISTRAT23_c1_2010_2022_df_prev1c, select=c("hash_key", "adm_date_rec")), hash_key, .keep_all = T), by="hash_key")|> tidytable::mutate(adm_yr=round(as.numeric((adm_date_rec-birth_date))/365.25,2), diff = abs(as.numeric(difftime(birth_date, avg_birth_date_po, units = "days"))))Code
# Flowchart (Graphviz DOT rendered via DiagrammeR::grViz) documenting the
# rule-based selection of a single birth date per person (steps 2.1.1.a-c).
# The DOT source is a runtime string and is left byte-identical.
plot_inconsistent_ages_flowchart <- DiagrammeR::grViz("
digraph {
graph [layout = dot, rankdir = TB]
# Global node attributes
node [shape = box, fontname = Helvetica]
# Nodes
start [label = 'Start (2.1.1)', shape = circle]
filter_age [label = 'Filter birth dates with year of admission at <16 or >90\\nunless dates are consistent within ±2 years with other birth dates']
check_year_diff [label = 'Calculate year difference within group']
year_diff_na [label = 'Is difference in years missing?', shape=diamond]
log_error [label = 'Log error (year_diff is NA)']
decision_year_diff [label = 'Is difference in\nyears > 2?', shape=diamond]
discard_inconsistent [label = '2.1.1.a.Discard entries with\ninconsistent dates']
count_orig [label = 'Count most frequent birth dates within C1 source']
one_common_date [label = 'Is there exactly one\nmost common date in C1 source?', shape=diamond]
multiple_common_dates [label = 'Are there multiple most common\ndates in C1 source?', shape=diamond]
no_common_date [label = 'No common dates\nin C1 source']
calculate_avg [label = 'Calculate average of\nmost common dates overall']
select_closest [label = 'Select closest date\namong C1 dates to average']
flag_inconsistent [label = 'Flag record (no\nvalid birth date determined)']
assign_selected_date [label = 'Assign selected_birth_date']
end [label = 'End', shape = circle]
# Set Start and End to be on the same horizontal rank
# Define subgraph for top and bottom alignment of Start and End
{ rank = min; start; }
{ rank = max; end; }
# Connections
start -> filter_age
filter_age -> check_year_diff
check_year_diff -> year_diff_na
year_diff_na -> log_error [label = 'Yes']
log_error -> count_orig
year_diff_na -> decision_year_diff [label = 'No']
decision_year_diff -> discard_inconsistent [label = 'Yes']
discard_inconsistent -> count_orig
decision_year_diff -> count_orig [label = 'No']
count_orig -> one_common_date
one_common_date -> assign_selected_date [label = 'Yes (2.1.1.b)']
assign_selected_date -> end
one_common_date -> multiple_common_dates [label = 'No']
multiple_common_dates -> calculate_avg [label = 'Yes (2.1.1.c)']
multiple_common_dates -> no_common_date [label = 'No']
no_common_date -> flag_inconsistent
flag_inconsistent -> end
calculate_avg -> select_closest
select_closest -> assign_selected_date
}
",
width = 800,
height = 900)
# Display the widget in the rendered document.
plot_inconsistent_ages_flowchart
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:
invisible("Export database to explore it")
# Recompute the output path (same idiom as earlier chunks).
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath
# WidthCM<-8
# HeightCM<-6
# DPI<-600
# Remove any stale htmlwidget bundle, save the flowchart as a standalone HTML
# widget, then screenshot it to PNG (zoom=10 for a high-resolution capture).
unlink(paste0(wdpath,"cons/_figs/inconsistent_birthdates_flowchart_files"), recursive = TRUE)
htmlwidgets::saveWidget(plot_inconsistent_ages_flowchart, paste0(wdpath,"cons/_figs/inconsistent_birthdates_flowchart.html"))
webshot::webshot(paste0(wdpath,"cons/_figs/inconsistent_birthdates_flowchart.html"),paste0(wdpath,"cons/_figs/inconsistent_birthdates_flowchart.png"), vwidth = 300*1.2, vheight = 300, zoom=10, expand=100) # Try different top, left, width, and height coordinates.Code
#https://stackoverflow.com/questions/1554635/graphviz-how-to-have-a-subgraph-be-left-to-right-when-main-graph-is-top-to-bot
#https://stackoverflow.com/questions/65509087/diagrammer-flowchart-align-vertical-nodes
#https://stackoverflow.com/questions/39451158/how-to-specify-vertical-alignment-of-nodes-in-r-package-diagrammer
#https://stackoverflow.com/questions/64323943/graphviz-and-dot-files-horizontal-and-vertical-node-alignment-intervening-node
#https://stackoverflow.com/questions/5424555/changing-edge-direction-in-dot
#https://graphviz.org/docs/attrs/rankdir/Workflow for rule-based selection of consistent birth dates (w/ext. data)
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/"
Code
invisible("1. <16 & >90 (adm_yr), might be wrong ages ")
invisible("2. mfv within c1, if there is only one birth date with rank 1, that is, with no ties in the most frequent value")
invisible("3. closeness of one of candidate birthdates to the external value")
invisible("4. retrieval year")
invisible("3. Closest date to external average selected due to tie")
invisible("5. mean?")
invisible("(Some of the external dbs are not reliable due to imprecise quantity in the birth date)")
# Empty accumulator data frames. They are grown via `<<-` (rbind) from inside
# the per-group processing function defined below — a deliberate global side
# effect used to log how each hash_key group was resolved.
# Groups where the year difference could not be computed (all-NA subgroup).
error_log <- data.frame(
hash_key = character(),
orig_mean = numeric(),
non_orig_mean = numeric(),
year_diff = numeric(),
issue_description = character(),
stringsAsFactors = FALSE
)
# Groups where Less16/More90 rows were removed (flowchart step 2.1.1.a).
group_1690_log <- data.frame(
hash_key = character(),
desc = character(),
stringsAsFactors = FALSE
)
# Groups resolved by a unique most-frequent C1 birth date (step 2.1.1.b).
group_mfv1_log <- data.frame(
hash_key = character(),
desc = character(),
stringsAsFactors = FALSE
)
# Groups resolved by tie-breaking against external dates (step 2.1.1.c).
group_mfv2_log <- data.frame(
hash_key = character(),
desc = character(),
stringsAsFactors = FALSE
)
invisible("1. discard birth dates with adm_yr at <16 & >90 , unless every other birth dates are similar (-/+2 yrs) or lower 2. select the most frequent value within orig_ birth dates (source_birth) as long as there is more than 1 case in each frequent value; if there is only one birth date with rank 1, that is, with no ties in the most frequent value 3. if there are 2 ties or more, select the value within orig_ in source_birth which is closer to an external date or the average of the external dates 4. if 1, 2,3 did not apply, flag them in a new column")
# Define a function to process each 'hash_key' group
# Select a single consistent birth date for one hash_key group (groups are
# produced by dplyr::group_split() in the pipeline below).
#
# Decision rules (cf. the invisible() notes above):
#   1. If rows were flagged "Less16"/"More90" at admission, drop them when
#      their mean birth year differs by >2 years from the remaining rows.
#   2. Take the most frequent birth date among "orig" sources; if unique,
#      select it (flag = FALSE).
#   3. On ties, select the tied date closest to the average of the overall
#      (all-sources) most frequent dates.
#   4. Otherwise leave sel_birth_date = NA and set flag = TRUE.
#
# Side effects: appends rows to the global logs `error_log`,
# `group_1690_log`, `group_mfv1_log`, `group_mfv2_log` via `<<-`.
#
# NOTE(review): the second parameter is never used — it is overwritten on the
# first line, and the caller passes only one argument (this is legal only
# because the unused argument is never forced). Consider dropping it.
process_birthdates <- function(group, hash_key) {
  # Loaded per call; redundant when already attached but keeps the function
  # self-contained.
  library(dplyr)
  library(lubridate)
  hash_key <- unique(group$hash_key)
  # Remove extremely implausible birth dates
  # Not necessary because of the first filter
  # group <- group %>%
  # mutate(
  # birth_date = ifelse(
  # birth_date <= as.Date("1910-01-01") | birth_date >= as.Date("2010-01-01"),
  # NA,
  # birth_date
  # )
  # )
  # Filter out "Less16" and "More90" unless dates within group are consistent within ±2 years
  # Check groups with an aberrant admission age: compare the max vs. min of
  # those birth dates; if the difference exceeds 2 years, drop that age.
  # We add (only) where the source contains "orig".
  if(any(group$adm_yr %in% c("Less16", "More90"))) {
    # Mean birth YEAR of the aberrant rows vs. the remaining rows.
    orig_mean <- mean(year(group$birth_date[group$adm_yr %in% c("Less16", "More90")]), na.rm = TRUE)
    non_orig_mean <- mean(year(group$birth_date[!group$adm_yr %in% c("Less16", "More90")]), na.rm = TRUE)
    year_diff <- abs(orig_mean - non_orig_mean)
    # If year_diff is NA (one side had no usable dates), record the group.
    if(is.na(year_diff)) {
      error_log <<- rbind(error_log, data.frame(
        hash_key = hash_key,
        orig_mean = orig_mean,
        non_orig_mean = non_orig_mean,
        year_diff = year_diff,
        issue_description = "year_diff is NA due to missing values in orig or non-orig group",
        stringsAsFactors = FALSE
      ))
    }
    # Drop the aberrant rows only when the year difference exceeds 2.
    if(!is.na(year_diff) && year_diff > 2) {
      group <- group %>% dplyr::filter(!adm_yr %in% c("Less16", "More90"))
      group_1690_log <<- rbind(group_1690_log, data.frame(
        hash_key = hash_key,
        desc = "Less16|More90, removed rows due to >2 |diff|",
        stringsAsFactors = FALSE
      ))
    }
  }
  # Frequency of birth dates among original (orig) sources only.
  # NOTE(review): when no "orig" source exists, most_common_dates is set to a
  # length-1 NA, so the single-mode branch below fires with
  # sel_birth_date = NA and flag = FALSE; the flag = TRUE fallback looks
  # unreachable — confirm this is intended.
  birth_counts <- table(group$birth_date[grepl("orig", group$source_birth)])
  if(length(birth_counts) > 0) {
    most_common_dates <- names(birth_counts)[which(birth_counts == max(birth_counts))]
  } else {
    most_common_dates <- NA
  }
  # Frequency of birth dates across ALL sources (used to break ties).
  birth_counts_os <- table(group$birth_date)
  if(length(birth_counts_os) > 0) {
    most_common_dates_os <- names(birth_counts_os)[which(birth_counts_os == max(birth_counts_os))]
  } else {
    most_common_dates_os <- NA
  }
  # Rule 2: a unique most common date among orig sources wins outright.
  if(length(most_common_dates) == 1) {
    group <- group %>%
      dplyr::mutate(sel_birth_date = most_common_dates[1], flag = FALSE)
    group_mfv1_log <<- rbind(group_mfv1_log, data.frame(
      hash_key = hash_key,
      desc = "The most common date is selected as the birth date",
      stringsAsFactors = FALSE
    ))
  } else if(length(most_common_dates) >= 2) {
    # Rule 3: tie — pick the tied orig date closest to the average of the
    # overall most frequent dates (which include the external sources).
    avg_date <- as.Date(mean(as.numeric(as.Date(most_common_dates_os)), na.rm=T), origin = "1970-01-01")
    closest_date <- most_common_dates[which.min(abs(as.Date(most_common_dates) - avg_date))]
    group <- group %>%
      dplyr::mutate(sel_birth_date = closest_date, flag = FALSE)
    group_mfv2_log <<- rbind(group_mfv2_log, data.frame(
      hash_key = hash_key,
      desc = "Multiple common dates found. Select the birth date closest to available external records",
      stringsAsFactors = FALSE
    ))
  } else {
    # Rule 4: nothing selectable — flag the group for manual review.
    group <- group %>%
      dplyr::mutate(sel_birth_date = NA, flag = TRUE)
  }
  return(group)
}
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
invisible("2024-11-18: Add PO to the candidate brith dates")
invisible("Consolidated dataframe of birthdates")
# Build the long table of candidate birth dates (C1 "orig" rows plus the PO
# average date as one extra external candidate), resolve each hash_key with
# process_birthdates(), then attach the per-rule log notes as obs1/obs2/obs3
# and a combined semicolon-separated `obs` column.
proc_birthdates_ext_data <-
  inconsistent_hashs_h_to_m_long|>
  # Attach the PO-office average birth date (first match per hash_key).
  tidylog::left_join(dplyr::distinct(inconsistent_hashs_may23_PO_office, HASH_KEY.x, avg_birth_date_po), by= c("hash_key"="HASH_KEY.x"), multiple="first")|>
  dplyr::filter(!is.na(avg_birth_date_po))|>
  dplyr::select(hash_key, avg_birth_date_po)|>
  # Reshape the PO date into the same (hash_key, source_birth, birth_date,
  # adm_yr) layout as the C1 candidates.
  dplyr::mutate(source_birth = "avg_birth_date_po", birth_date = avg_birth_date_po, adm_yr = "")|>
  dplyr::select(-avg_birth_date_po)|>
  dplyr::distinct(hash_key, birth_date) |>
  dplyr::mutate(adm_yr="", source_birth="avg_po_date") |>
  dplyr::select(hash_key, source_birth, birth_date, adm_yr) |>
  # Stack the PO rows under the original candidates.
  (\(df) {
    dplyr::bind_rows(inconsistent_hashs_h_to_m_long, df)
  })()|>
  dplyr::arrange(hash_key, source_birth) |>
  dplyr::group_split(hash_key)|>
  # NOTE(review): process_birthdates() declares two parameters but is called
  # with one; this works only because the unused argument is never forced.
  purrr::map_dfr(~ process_birthdates(.x))|>
  # Bring in the rule logs filled (via <<-) by process_birthdates().
  tidylog::left_join(group_1690_log, by="hash_key", multiple="first")|>
  tidylog::left_join(group_mfv1_log, by="hash_key", multiple="first")|>
  tidylog::left_join(group_mfv2_log, by="hash_key", multiple="first")|>
  dplyr::rename("obs1"="desc.x","obs2"="desc.y","obs3"="desc")|>
  # Prefix each log note with its rule code (2.1.1.a/b/c).
  dplyr::mutate(obs1= ifelse(!is.na(obs1),paste0("2.1.1.a.",obs1),obs1))|>
  dplyr::mutate(obs2= ifelse(!is.na(obs2),paste0("2.1.1.b.",obs2),obs2))|>
  dplyr::mutate(obs3= ifelse(!is.na(obs3),paste0("2.1.1.c.",obs3),obs3))|>
  # Collapse the three rule notes into one " ;"-separated `obs` string.
  dplyr::mutate(obs = purrr::pmap_chr(list(obs1, obs2, obs3), ~ paste(na.omit(c(...)), collapse = " ;")))Code
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
# Report how many groups failed the year-difference computation in
# process_birthdates() and, if any, render them as a markdown table.
message(paste0("Number of cases in error log: ", nrow(error_log)))Code
if(nrow(error_log)>0){
  # Example affected hash:
  #a534782e4a670be9fe584f9b1b47a7f38a73f6b08f18aa7a7b4a45a7ba628dbf
  inconsistent_hashs_h_to_m_long|>
    dplyr::filter(hash_key %in% error_log$hash_key) |>
    knitr::kable("markdown", caption="HASHs with no records >16|<90 yrs old at admission to treatment")
}
invisible("Flags")
message(paste0("Number of flags= ", nrow(subset(proc_birthdates_ext_data, flag == TRUE))))
# Report entry and unique-hash counts for rows whose `obs` column carries
# every tag given in `...`. Fixes two defects of the original per-tag blocks:
#   * grepl() lacked `fixed = TRUE`, so the dots in tags such as "2.1.1.a."
#     matched ANY character, not just a literal dot;
#   * print(message(...)) printed the NULL that message() returns, producing
#     the stray "NULL" lines in the rendered output.
# Returns the matching rows invisibly so the call can be piped if needed.
report_obs_211 <- function(df, ...) {
  tags <- c(...)
  hits <- df
  for (tag in tags) {
    hits <- dplyr::filter(hits, grepl(tag, obs, fixed = TRUE))
  }
  label <- paste(tags, collapse = " & ")
  message(paste0(label, ", Entries: ", nrow(hits)))
  message(paste0(label, ", RUNs: ", tidytable::distinct(hits, hash_key) |> nrow()))
  invisible(hits)
}
report_obs_211(proc_birthdates_ext_data, "2.1.1.a.")
report_obs_211(proc_birthdates_ext_data, "2.1.1.b.")
report_obs_211(proc_birthdates_ext_data, "2.1.1.c.")
report_obs_211(proc_birthdates_ext_data, "2.1.1.a.", "2.1.1.b.")
report_obs_211(proc_birthdates_ext_data, "2.1.1.a.", "2.1.1.c.")
report_obs_211(proc_birthdates_ext_data, "2.1.1.b.", "2.1.1.c.")
# 2.1.1.a., Entries: 1274
# 2.1.1.a., RUNs: 283
# 2.1.1.b., Entries: 11488
# 2.1.1.b., RUNs: 1748
# 2.1.1.c., Entries: 7255
# 2.1.1.c., RUNs: 1568
# 2.1.1.a. & 2.1.1.b., Entries: 1238
# 2.1.1.a. & 2.1.1.b., RUNs: 275
# 2.1.1.a. & 2.1.1.c., Entries: 36
# 2.1.1.a. & 2.1.1.c., RUNs: 8
# 2.1.1.b. & 2.1.1.c., Entries: 0
# 2.1.1.b. & 2.1.1.c., RUNs: 0
# Rebuild the project root path (strip any trailing "cons" component) and
# derive the environment path from the drive letter.
# NOTE(review): regmatches() grabs the first run of letters in the path to
# decide between the G: and E: drives — fragile if the path layout changes.
wdpath<-
  paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath
invisible("manual review of criteria to see consistency")
# Export a reproducible sample (seed = 2125) of 100 inconsistent hashes for
# manual review of the selection criteria.
proc_birthdates_ext_data|>
  tidytable::filter(hash_key %in% dplyr::pull(sample_n_with_seed(data.frame(hashs_inconsistent_birth_dates),100, seed=2125),1))|>
  rio::export(file=paste0(wdpath,"cons/_out/inconsistent_birthdates_long_sample.csv"))
#every entry has less 16 years oldNULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/"
2.1.2. Inconsistent birth dates with no external data
However, there were cases with no external data. In these cases, we discarded birth dates prior to 1910 or later than 2010.
Code
# Flowchart of the rule-based birth-date selection for hash_keys WITHOUT
# external data (rules 2.1.2.a-d, implemented in process_group_noextinfo()).
# All in-label line breaks now use the escaped Graphviz form "\\n" (a literal
# backslash-n in the dot source); the original mixed raw newlines ("\n") with
# escaped ones, which rendered inconsistently.
plot_inconsistent_ages_noext_flowchart <- DiagrammeR::grViz("
digraph {
graph [layout = dot, rankdir = TB]
# Global node attributes
node [shape = box, fontname = Helvetica]
# Nodes
start [label = 'Start', shape = circle]
discard_dates [label = 'Discard birth dates prior to 1910-01-01\\nor after 2010-01-01']
check_valid_dates [label = 'Any valid birth dates remaining?', shape = diamond]
no_valid_dates [label = 'Set sel_birth_date to NA\\nFLAG: No valid birth dates']
avg_birth_date_po_available [label = 'Is any birth date from PO available?', shape = diamond]
select_closest_to_po [label = '2.1.2.a.Select date closest to PO']
create_frequency_table [label = 'Create frequency table of birth_dates']
single_most_frequent [label = 'Is there a single most frequent date?', shape = diamond]
select_most_frequent [label = '2.1.2.b.Select most frequent date']
tie_mfv_twoyrs [label = 'Differences greater than\\n2 years between ties?', shape = diamond]
tie_most_frequent [label = '2.1.2.c.1.Ties among most\\nfrequent dates with\\ndifferences > 2 years\\nFLAG: Unresolved inconsistencies']
tie_most_frequent2 [label = '2.1.2.c.2.Ties among most\\nfrequent dates with\\ndifferences <= 2 years']
end [label = 'End', shape = circle]
# Connections
start -> discard_dates
discard_dates -> check_valid_dates
check_valid_dates -> no_valid_dates [label = 'No']
#no_valid_dates -> end
check_valid_dates -> avg_birth_date_po_available [label = 'Yes']
avg_birth_date_po_available -> select_closest_to_po [label = 'Yes']
select_closest_to_po -> end
avg_birth_date_po_available -> create_frequency_table [label = 'No']
create_frequency_table -> single_most_frequent
single_most_frequent -> select_most_frequent [label = 'Yes']
select_most_frequent -> end
single_most_frequent -> tie_mfv_twoyrs [label = 'No']
tie_mfv_twoyrs -> tie_most_frequent [label = 'No']
tie_mfv_twoyrs -> tie_most_frequent2 [label = 'Yes']
tie_most_frequent2 -> end
}
",
width = 800,
height = 900)
# desc = "Less16|More90, removed rows due to >2 |diff|",
# desc = "The most common date is selected as the birth date",
# desc = "Multiple common dates found. Select the birth date closest to available external records",
plot_inconsistent_ages_noext_flowchart
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:
invisible("Export database to explore it")
# Rebuild paths (same drive-letter heuristic as above).
wdpath<-
  paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath
# WidthCM<-8
# HeightCM<-6
# DPI<-600
# Refresh the saved flowchart widget and render it to PNG.
unlink(paste0(wdpath,"cons/_figs/inconsistent_ages_noext_flowchart_files"), recursive = TRUE)
htmlwidgets::saveWidget(plot_inconsistent_ages_noext_flowchart, paste0(wdpath,"cons/_figs/inconsistent_ages_noext_flowchart.html"))
webshot::webshot(paste0(wdpath,"cons/_figs/inconsistent_ages_noext_flowchart.html"),paste0(wdpath,"cons/_figs/inconsistent_ages_noext_flowchart.png"), vwidth = 300*1.2, vheight = 300, zoom=10, expand=100) # Try different top, left, width, and height coordinates.
#https://stackoverflow.com/questions/1554635/graphviz-how-to-have-a-subgraph-be-left-to-right-when-main-graph-is-top-to-bot
#https://stackoverflow.com/questions/65509087/diagrammer-flowchart-align-vertical-nodes
#https://stackoverflow.com/questions/39451158/how-to-specify-vertical-alignment-of-nodes-in-r-package-diagrammer
#https://stackoverflow.com/questions/64323943/graphviz-and-dot-files-horizontal-and-vertical-node-alignment-intervening-node
#https://stackoverflow.com/questions/5424555/changing-edge-direction-in-dot
#https://graphviz.org/docs/attrs/rankdir/
Workflow for rule-based selection of consistent birth dates (No ext. data)
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/"
Code
invisible("Discard birth dates previous or equal to 1910-01-01, or posterior to 2010-01-01")
invisible("IF PO available, select the date closer to PO in sel_birth_date")
invisible("If there is no PO available, select the most frequent date")
invisible("If ties among most frequent dates, FLAG it column FLAG as unresolvable inconsistency and return an NA in sel_birth_date")
invisible("IF PO available, select the date closer to PO in sel_birth_date")
# Function to process each group
# Resolve the birth date for one hash_key group that has NO external (C1)
# candidate sources, following the flowchart above:
#   0. Discard implausible birth dates (on/before 1910-01-01 or on/after
#      2010-01-01; requested 2025-03-19).
#   1. If a PO reference date ("avg_birth_date_po") exists, select the
#      candidate closest to it.
#   2. Otherwise select the most frequent candidate; ties spanning <= 2
#      years are replaced by their average, wider ties are flagged as
#      unresolvable.
#
# @param df_group data frame for a single hash_key; must contain `birth_date`
#   (Date) and `avg_birth_date_po` (Date, possibly all NA).
# @return `df_group` with `sel_birth_date`, `FLAG` and `obs` columns added.
process_group_noextinfo <- function(df_group) {
  # Rule 0: drop implausible birth dates.
  df_group <- df_group |>
    mutate(
      birth_date = if_else(
        birth_date <= as.Date("1910-01-01") | birth_date >= as.Date("2010-01-01"),
        as.Date(NA_character_),
        birth_date
      )
    )
  # Initialize the output columns so every exit path returns the same schema.
  df_group <- df_group |>
    mutate(
      sel_birth_date = as.Date(NA),
      FLAG = NA,
      obs = NA
    )
  # Distinct non-missing PO reference date(s). This vector can be empty or,
  # rarely, longer than 1: previously a length-2 vector triggered the
  # "'length = 2' in coercion to 'logical(1)'" error and, after the
  # workaround, was silently recycled inside difftime(), yielding wrong
  # distances. Multiple PO dates are now collapsed to their mean before
  # measuring closeness.
  avg_birth_date_po <- unique(df_group$avg_birth_date_po[!is.na(df_group$avg_birth_date_po)])
  # Candidate birth dates that survived Rule 0.
  birth_dates <- df_group$birth_date[!is.na(df_group$birth_date)]
  # Rule 1: PO reference available -> pick the candidate closest to it.
  if (length(avg_birth_date_po) > 0 && length(birth_dates) > 0) {
    po_ref <- if (length(avg_birth_date_po) > 1) {
      mean(avg_birth_date_po)
    } else {
      avg_birth_date_po
    }
    diffs <- abs(as.numeric(difftime(birth_dates, po_ref, units = "days")))
    diffs[is.na(diffs)] <- Inf
    if (all(is.infinite(diffs))) {
      # Every distance is unusable: keep NA and flag the group.
      df_group <- df_group |>
        mutate(
          sel_birth_date = as.Date(NA),
          obs = "2.1.2.0.All diffs are NA or Inf",
          FLAG = "Unable to select date closest to PO"
        )
      return(df_group)
    }
    sel_date <- birth_dates[which.min(diffs)]
    df_group <- df_group |>
      mutate(
        sel_birth_date = sel_date,
        obs = "2.1.2.a.Selected date closest to PO",
        FLAG = NA
      )
    return(df_group)
  }
  # Rule 2: no PO reference -> fall back to the most frequent candidate.
  if (length(birth_dates) > 0) {
    date_counts <- table(birth_dates)
    max_count <- max(date_counts)
    most_freq_dates <- as.Date(names(date_counts[date_counts == max_count]))
    if (length(most_freq_dates) == 1) {
      # Single mode: select it directly.
      sel_date <- most_freq_dates
      df_group <- df_group |>
        dplyr::mutate(
          sel_birth_date = sel_date,
          obs = "2.1.2.b.Selected most frequent date",
          FLAG = NA
        )
      return(df_group)
    }
    # Tie among most frequent dates: resolvable only when the tied dates
    # span at most 2 years (730 days).
    max_diff_days <- as.numeric(max(most_freq_dates) - min(most_freq_dates))
    if (max_diff_days > 730) {
      df_group <- df_group |>
        mutate(
          sel_birth_date = as.Date(NA),
          FLAG = "Unresolvable inconsistency: Ties among most frequent dates",
          obs = "2.1.2.c1.Ties among most frequent dates. Unresolvable inconsistency"
        )
      return(df_group)
    }
    # Ties within 2 years: use the average of the tied dates.
    sel_date <- base::as.Date(mean(as.numeric(most_freq_dates)), origin = "1970-01-01")
    df_group <- df_group |>
      mutate(
        sel_birth_date = sel_date,
        obs = "2.1.2.c2.Similar ties in dates, replaced with the average date",
        FLAG = NA
      )
    return(df_group)
  }
  # Rule 0 removed every candidate: nothing to select.
  df_group <- df_group |>
    mutate(
      sel_birth_date = base::as.Date(NA),
      FLAG = "No valid birth dates",
      obs = "2.1.2.d.No valid birth dates"
    )
  return(df_group)
}
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
# Apply the function to every hash_key
# inconsistent_hashs_h_to_m_no_ex
# Apply process_group_noextinfo() to every hash_key group that has no
# external data, logging entry/hash counts first.
# NOTE(review): print(message(...)) prints the NULL returned by message() —
# that is the origin of the stray "NULL" lines in the rendered output.
proc_inconsistent_hashs_h_to_m_no_ext_long <-
  inconsistent_hashs_h_to_m_no_ext_long |>
  (\(df) {
    print(message(paste0("Inconsistent birth dates with no external data, Entries: ", nrow(df))))
    print(message(paste0("Inconsistent birth dates with no external data, RUNs: ", tidytable::distinct(df, hash_key) |> nrow())))
    df
  })()|>
  dplyr::group_by(hash_key) |>
  dplyr::group_split() |>
  purrr::map_dfr(~ process_group_noextinfo(.x))Code
#Inconsistent birth dates with no external data, Entries: 1022
#Inconsistent birth dates with no external data, RUNs: 397
# debug(process_group_noextinfo)
#
# # run your problematic command:
# proc_inconsistent_hashs_h_to_m_no_ext_long <- inconsistent_hashs_h_to_m_no_ext_long |>
# dplyr::group_by(hash_key) |>
# dplyr::group_split() |>
# purrr::map_dfr(~ process_group_noextinfo(.x))
#
# undebug(process_group_noextinfo)
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
invisible("get numbers of resolutions")
# Report entry and unique-hash counts per resolution tag in `obs`.
# Fixes two defects of the original per-tag blocks:
#   * grepl() lacked `fixed = TRUE`, so the dots in tags like "2.1.2.a."
#     matched ANY character, not just a literal dot;
#   * print(message(...)) printed message()'s NULL return value, producing
#     the stray "NULL" lines in the rendered output.
report_obs_212 <- function(df, tag) {
  hits <- dplyr::filter(df, grepl(tag, obs, fixed = TRUE))
  message(paste0(tag, ", Entries: ", nrow(hits)))
  message(paste0(tag, ", RUNs: ", tidytable::distinct(hits, hash_key) |> nrow()))
  invisible(hits)
}
for (tag in c("2.1.2.0.", "2.1.2.a.", "2.1.2.b.", "2.1.2.c1.", "2.1.2.c2.", "2.1.2.d.")) {
  report_obs_212(proc_inconsistent_hashs_h_to_m_no_ext_long, tag)
}
# 2.1.2.0., Entries: 0
# 2.1.2.0., RUNs: 0
# 2.1.2.a., Entries: 921
# 2.1.2.a., RUNs: 353
# 2.1.2.b., Entries: 30
# 2.1.2.b., RUNs: 10
# 2.1.2.c1., Entries: 26
# 2.1.2.c1., RUNs: 12
# 2.1.2.c2., Entries: 45
# 2.1.2.c2., RUNs: 22
# 2.1.2.d., Entries: 0
# 2.1.2.d., RUNs: 0NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
We added the corrected birth date and admission age.
Code
# tidytable::as_tidytable(SISTRAT23_c1_2010_2022_df_prev00)|>
# tidytable::filter(grepl("c46caa3cd2c89a2222ce319cf6f5e98392f928e0544ee5487",hash_key)) |> glimpse()
# Write the corrected birth dates back into the main C1 dataframe: first the
# external-data resolutions (obs.x / sel_birth_date.x), then the
# no-external-data resolutions (obs.y / sel_birth_date.y); recompute the
# admission age and append the rule notes to OBS.
SISTRAT23_c1_2010_2022_df_prev1d<-
  SISTRAT23_c1_2010_2022_df_prev1c|>
  tidylog::left_join(proc_birthdates_ext_data[,c("hash_key","sel_birth_date","obs")], by="hash_key", multiple="first")|>
  tidylog::left_join(proc_inconsistent_hashs_h_to_m_no_ext_long[,c("hash_key","sel_birth_date","obs")], by="hash_key", multiple="first")|>
  # Prefer the resolved dates wherever a rule note exists.
  tidytable::mutate(birth_date= tidytable::case_when(!is.na(obs.x)~ as.Date(sel_birth_date.x), T~birth_date))|>
  tidytable::mutate(birth_date= tidytable::case_when(!is.na(obs.y)~ as.Date(sel_birth_date.y), T~birth_date))|>
  tidytable::mutate(birth_date= as.Date(birth_date))|>
  # Age at admission in years (365.25-day years, 2 decimals).
  tidytable::mutate(adm_yr= round(as.numeric((adm_date_rec-birth_date))/365.25,2))|>
  tidytable::mutate(OBS = tidytable::case_when( !is.na(obs.x)~ glue("{OBS};{obs.x}"),T~OBS))|>
  tidytable::mutate(OBS = tidytable::case_when( !is.na(obs.y)~ glue("{OBS};{obs.y}"),T~OBS))|>
  tidytable::mutate(OBS= gsub("^;", "", OBS))|>
  # Drop the temporary obs.x/.y and sel_birth_date.x/.y columns.
  tidytable::select(-any_of(as.vector(outer(c("obs","sel_birth_date"), c(".x",".y"), FUN = paste, sep = ""))))|>
  tidytable::as_tidytable()Code
# Count the entries/hashes whose recomputed admission age is still invalid.
message(paste0("Number of entries w/ infrequent (>90|<16) or missing admission ages= ",
tidytable::filter(SISTRAT23_c1_2010_2022_df_prev1d, adm_yr>90|adm_yr<16|is.na(adm_yr))|> nrow(),"\n(HASHs= ",
tidytable::filter(SISTRAT23_c1_2010_2022_df_prev1d, adm_yr>90|adm_yr<16|is.na(adm_yr))|> distinct(hash_key) |> nrow(),")"))Code
# Number of entries w/ infrequent (>90|<16) or missing admission ages= 287
# (HASHs= 273)
However, infrequent admission ages (>90 or <16) persisted (n = 283; HASHs = 271), prompting us to explore other databases.
2.2. Invalid admission ages
Code
invisible("Select HASHs")
# hash_keys whose recomputed admission age is still invalid (>90, <16, or
# missing) after the corrections above.
hashs_invalid_adm_age<-
  tidytable::filter(SISTRAT23_c1_2010_2022_df_prev1d, adm_yr>90|adm_yr<16|is.na(adm_yr))|>
  distinct(hash_key)|>
  tidytable::pull(hash_key)
#a534782e4a670be9fe584f9b1b47a7f38a73f6b08f18aa7a7b4a45a7ba628dbf is in the vector
invisible("Get external databases for our HASHs with invalid admission ages")
invisible("======================================================")
#6d5f2fc8d4c835e227ac7f99c96f710c235b0415d95571a976b481f9170a4c34
# Hospital-discharge birth dates, one wide row per run
# (h_fechnac_1, h_fechnac_2, ...).
# NOTE(review): print(message(...)) prints message()'s NULL — source of the
# "NULL" lines in the rendered output.
invalid_adm_age_hashs_hosp<-
  HOSP_filter_df|>
  tidytable::filter(run %in% hashs_invalid_adm_age)|>
  (\(df) {
    print(message(paste0("Hospital, Entries: ", nrow(df))))
    print(message(paste0("Hospital, RUNs: ", tidytable::distinct(df, run) |> nrow())))
    df
  })()|>
  tidytable::distinct(run, fecha_nac)|>
  tidytable::group_by(run)|>
  tidytable::mutate(id = as.character(dplyr::row_number()))|>
  tidytable::pivot_wider(names_from = id, values_from = fecha_nac,
                         names_prefix = "h_fechnac_")Code
invisible("Since this measure depended on the admission day and month, we calculated the average birth date.")
# Average hospital birth date per run (the hospital-recorded birth date
# varies with the admission day/month, so we summarise to its mean), plus
# the number of distinct dates observed.
invalid_adm_age_hosp_avg<-
  HOSP_filter_df|>
  tidytable::filter(run %in% hashs_invalid_adm_age)|>
  tidytable::distinct(run, fecha_nac)|>
  tidytable::group_by(run)|>
  tidytable::summarise(
    # `TRUE` spelled out (was `na.rm=T`; `T` is a reassignable binding).
    h_avg_birth_date = as.Date(mean(as.numeric(fecha_nac), na.rm = TRUE), origin = "1970-01-01"),
    ndis_birth_date = n_distinct(fecha_nac)
  )|>
  tidytable::ungroup()
# Hospital, Entries: 558
# NULL
# Hospital, RUNs: 152
# NULL
invisible("======================================================")
# Candidate birth dates for the invalid-admission-age hashes, pulled from the
# external databases (TOP, C2-C6 follow-ups, mortality). Each chunk logs its
# entry/hash counts, deduplicates (key, birth_date), and pivots to one wide
# row per hash with numbered *_fechnac_N columns.
# NOTE(review): print(message(...)) prints message()'s NULL in every chunk —
# source of the "NULL" lines in the rendered output.
invalid_adm_age_top<-
  SISTRAT23_top_2015_2022_df|>
  tidytable::filter(HASH_KEY %in% hashs_invalid_adm_age)|>
  (\(df) {
    print(message(paste0("TOP, Entries: ", nrow(df))))
    print(message(paste0("TOP, RUNs: ", tidytable::distinct(df, HASH_KEY)|> nrow())))
    df
  })()|>
  tidytable::distinct(HASH_KEY, birth_date)|>
  tidytable::ungroup()|>
  tidytable::group_by(HASH_KEY)|>
  tidytable::mutate(id = as.character(dplyr::row_number()))|> # Convert `id` to character
  tidytable::pivot_wider(names_from = id, values_from = birth_date,
                         names_prefix = "t_fechnac_")Code
# TOP, Entries: 170
# NULL
# TOP, RUNs: 56
# NULL
invisible("======================================================")
# C2 follow-up birth dates (c2_fechnac_N).
invalid_adm_age_c2<-
  CONS_C2 |>
  tidytable::filter(HASH_KEY %in% hashs_invalid_adm_age)|>
  (\(df) {
    print(message(paste0("C2, Entries: ", nrow(df))))
    print(message(paste0("C2, RUNs: ", tidytable::distinct(df, HASH_KEY)|> nrow())))
    df
  })()|>
  tidytable::distinct(HASH_KEY, birth_date)|>
  tidytable::ungroup()|>
  tidytable::group_by(HASH_KEY)|>
  tidytable::mutate(id = as.character(dplyr::row_number()))|> # Convert `id` to character
  tidytable::pivot_wider(names_from = id, values_from = birth_date,
                         names_prefix = "c2_fechnac_")Code
# C2, Entries: 16
# NULL
# C2, RUNs: 5
# NULL
invisible("======================================================")
# C3 follow-up birth dates (c3_fechnac_N).
invalid_adm_age_c3<-
  CONS_C3|>
  tidytable::filter(HASH_KEY %in% hashs_invalid_adm_age)|>
  (\(df) {
    print(message(paste0("C3, Entries: ", nrow(df))))
    print(message(paste0("C3, RUNs: ", tidytable::distinct(df, HASH_KEY)|> nrow())))
    df
  })()|>
  tidytable::distinct(HASH_KEY, birth_date)|>
  tidytable::ungroup()|>
  tidytable::group_by(HASH_KEY)|>
  tidytable::mutate(id = as.character(dplyr::row_number()))|> # Convert `id` to character
  tidytable::pivot_wider(names_from = id, values_from = birth_date,
                         names_prefix = "c3_fechnac_")Code
# C3, Entries: 4
# NULL
# C3, RUNs: 4
# NULL
invisible("======================================================")
# C4 follow-up birth dates (c4_fechnac_N).
invalid_adm_age_c4<-
  CONS_C4 |>
  tidytable::filter(HASH_KEY %in% hashs_invalid_adm_age)|>
  (\(df) {
    print(message(paste0("C4, Entries: ", nrow(df))))
    print(message(paste0("C4, RUNs: ", tidytable::distinct(df, HASH_KEY)|> nrow())))
    df
  })()|>
  tidytable::distinct(HASH_KEY, birth_date) |>
  tidytable::ungroup()|>
  tidytable::group_by(HASH_KEY)|>
  tidytable::mutate(id = as.character(dplyr::row_number()))|> # Convert `id` to character
  tidytable::pivot_wider(names_from = id, values_from = birth_date,
                         names_prefix = "c4_fechnac_")Code
invisible("======================================================")
# C5 follow-up birth dates (c5_fechnac_N).
invalid_adm_age_c5<-
  CONS_C5 |>
  tidytable::filter(HASH_KEY %in% hashs_invalid_adm_age)|>
  (\(df) {
    print(message(paste0("C5, Entries: ", nrow(df))))
    print(message(paste0("C5, RUNs: ", tidytable::distinct(df, HASH_KEY)|> nrow())))
    df
  })() |>
  tidytable::distinct(HASH_KEY, birth_date)|>
  tidytable::ungroup()|>
  tidytable::group_by(HASH_KEY)|>
  tidytable::mutate(id = as.character(dplyr::row_number()))|> # Convert `id` to character
  tidytable::pivot_wider(names_from = id, values_from = birth_date,
                         names_prefix = "c5_fechnac_")Code
invisible("======================================================")
# C6 follow-up birth dates (c6_fechnac_N).
invalid_adm_age_c6<-
  CONS_C6 |>
  tidytable::filter(HASH_KEY %in% hashs_invalid_adm_age)|>
  (\(df) {
    print(message(paste0("C6, Entries: ", nrow(df))))
    print(message(paste0("C6, RUNs: ", tidytable::distinct(df, HASH_KEY)|> nrow())))
    df
  })()|>
  tidytable::distinct(HASH_KEY, birth_date)|>
  tidytable::ungroup()|>
  tidytable::group_by(HASH_KEY)|>
  tidytable::mutate(id = as.character(dplyr::row_number()))|> # Convert `id` to character
  tidytable::pivot_wider(names_from = id, values_from = birth_date,
                         names_prefix = "c6_fechnac_")Code
invisible("======================================================")
# Mortality-register birth date, one per hash (kept long; renamed to
# m_birthdate).
invalid_adm_age_mortality<-
  mortality |>
  tidytable::filter(hashkey %in% hashs_invalid_adm_age)|>
  (\(df) {
    print(message(paste0("Mortality, Entries: ", nrow(df))))
    print(message(paste0("Mortality, RUNs: ", tidytable::distinct(df, hashkey)|> nrow())))
    df
  })()|>
  tidytable::distinct(hashkey, birth_date)|>
  tidytable::ungroup()|>
  tidytable::rename("m_birthdate"="birth_date")Code
# Mortality, Entries: 19
# NULL
# Mortality, RUNs: 19
# NULL
invisible("======================================================")
# PO-office average birth date per hash (via the old/new hash crosswalk).
invalid_adm_age_may23_PO_office<-
  OLD_NEW_SISTRAT23_c1_2010_2022_df2|>
  tidylog::right_join(Base_fiscalia_v2, by=c("HASH_KEY.y"="rut_enc_saf"))|>
  tidytable::select("HASH_KEY.x","HASH_KEY.y", "sexo.y","avg_birth_date_po")|>
  tidytable::filter(HASH_KEY.x %in% hashs_invalid_adm_age)|>
  (\(df) {
    print(message(paste0("PO Office, Entries: ", nrow(df))))
    print(message(paste0("PO Office, RUNs: ", tidytable::distinct(df, HASH_KEY.x) |> nrow())))
    df
  })()|>
  tidytable::distinct(HASH_KEY.x, avg_birth_date_po)|>
  tidytable::ungroup() Code
#PO Office, Entries: 2488
#PO Office, RUNs: 212
# C1 2023-24 birth dates (c1_2324_fechnac_N).
invalid_adm_age_c1_2324<-
  SISTRAT23_c1_2023_2024_df2 |>
  tidytable::filter(hash_key %in% hashs_invalid_adm_age) |>
  (\(df) {
    print(message(paste0("C1 2023-24, Entries: ", nrow(df))))
    print(message(paste0("C1 2023-24, RUNs: ", tidytable::distinct(df, hash_key) |> nrow())))
    df
  })() |>
  tidytable::distinct(hash_key, birth_date) |>
  tidytable::ungroup() |>
  tidytable::group_by(hash_key) |>
  tidytable::mutate(id = as.character(dplyr::row_number())) |>
  tidytable::pivot_wider(names_from = id, values_from = birth_date,
                         names_prefix = "c1_2324_fechnac_")Code
# TOP 2023-24 birth dates.
# NOTE(review): names_prefix reuses "c1_2324_fechnac_" — likely a copy-paste
# slip (expected something like "t_2324_fechnac_"); columns would collide if
# this table were ever joined alongside invalid_adm_age_c1_2324. Confirm
# before renaming, since downstream code may rely on the current name.
invalid_adm_age_top_2324<-
  top_2224 |>
  tidytable::filter(hashkey %in% hashs_invalid_adm_age) |>
  (\(df) {
    print(message(paste0("TOP 2023-24, Entries: ", nrow(df))))
    print(message(paste0("TOP 2023-24, RUNs: ", tidytable::distinct(df, hashkey) |> nrow())))
    df
  })() |>
  tidytable::distinct(hashkey, fecha_nacimiento) |>
  tidytable::ungroup() |>
  tidytable::group_by(hashkey) |>
  tidytable::mutate(id = as.character(dplyr::row_number())) |>
  tidytable::pivot_wider(names_from = id, values_from = fecha_nacimiento,
                         names_prefix = "c1_2324_fechnac_")Code
# C2 2023-24 birth dates.
# NOTE(review): the object name says "2224" and the prefix "c2_2224_" while
# the source table is c2_2324 and the log label says "C2 2023-24" —
# inconsistent vintage labels; confirm which year range is correct.
invalid_adm_age_c2_2224<-
  c2_2324 |>
  tidytable::filter(hashkey %in% hashs_invalid_adm_age) |>
  (\(df) {
    print(message(paste0("C2 2023-24, Entries: ", nrow(df))))
    print(message(paste0("C2 2023-24, RUNs: ", tidytable::distinct(df, hashkey) |> nrow())))
    df
  })() |>
  tidytable::distinct(hashkey, fecha_nacimiento) |>
  tidytable::ungroup() |>
  tidytable::group_by(hashkey) |>
  tidytable::mutate(id = as.character(dplyr::row_number())) |>
  tidytable::pivot_wider(names_from = id, values_from = fecha_nacimiento,
                         names_prefix = "c2_2224_fechnac_")NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
To have more alternative dates, we utilized the previous database (SISTRAT23_c1_2010_2022_df_prev1c) containing inconsistent birth dates. This allowed us to check if any replaced dates were valid, providing an alternative that could be used in this instance.
Code
# Invalid-age hashes that DO have at least one external candidate birth date:
# join every external source wide table onto the previous C1 snapshot, count
# the non-missing external columns per row, and keep rows with >= 1.
invalid_adm_ages_previous_values<-
  SISTRAT23_c1_2010_2022_df_prev1c|>
  tidytable::filter(hash_key %in% hashs_invalid_adm_age)|>
  tidylog::left_join(invalid_adm_age_hosp_avg, by=c("hash_key"="run"))|>
  tidytable::select(hash_key, birth_date, h_avg_birth_date)|>
  tidylog::left_join(invalid_adm_age_top, by=c("hash_key"="HASH_KEY"))|>
  tidylog::left_join(invalid_adm_age_c2, by=c("hash_key"="HASH_KEY"))|>
  tidylog::left_join(invalid_adm_age_c3, by=c("hash_key"="HASH_KEY"))|>
  tidylog::left_join(invalid_adm_age_c4, by=c("hash_key"="HASH_KEY"))|>
  tidylog::left_join(invalid_adm_age_c5, by=c("hash_key"="HASH_KEY"))|>
  tidylog::left_join(invalid_adm_age_c6, by=c("hash_key"="HASH_KEY"))|>
  tidylog::left_join(invalid_adm_age_mortality, by=c("hash_key"="hashkey"))|>
  tidylog::left_join(invalid_adm_age_c1_2324, by=c("hash_key"="hash_key"))|>
  # Count non-missing external dates per row (columns h_avg_birth_date
  # through c1_2324_fechnac_1; depends on current column order).
  tidytable::mutate_rowwise(
    non_NA_count = rowSums(!is.na(across(h_avg_birth_date:c1_2324_fechnac_1)))
  )|>
  dplyr::filter(non_NA_count>0)|>
  (\(df) {
    print(message(paste0("Invalid birth date that have at least one external birth date, Entries: ", nrow(df))))
    print(message(paste0("Invalid birth date that have at least one external birth date, RUNs: ", tidytable::distinct(df, hash_key)|> nrow())))
    df
  })() Code
#Invalid birth date that have at least one external birth date, Entries: 179
#Invalid birth date that have at least one external birth date, RUNs: 177
# Complement of the previous chunk: invalid-age hashes with NO external
# candidate (non_NA_count == 0). The join sequence is duplicated verbatim
# from invalid_adm_ages_previous_values; only the final filter differs —
# a candidate for refactoring into a shared intermediate.
invalid_adm_ages_no_ext_data<-
  SISTRAT23_c1_2010_2022_df_prev1c|>
  tidytable::filter(hash_key %in% hashs_invalid_adm_age)|>
  tidylog::left_join(invalid_adm_age_hosp_avg, by=c("hash_key"="run"))|>
  tidytable::select(hash_key, birth_date, h_avg_birth_date)|>
  tidylog::left_join(invalid_adm_age_top, by=c("hash_key"="HASH_KEY"))|>
  tidylog::left_join(invalid_adm_age_c2, by=c("hash_key"="HASH_KEY"))|>
  tidylog::left_join(invalid_adm_age_c3, by=c("hash_key"="HASH_KEY"))|>
  tidylog::left_join(invalid_adm_age_c4, by=c("hash_key"="HASH_KEY"))|>
  tidylog::left_join(invalid_adm_age_c5, by=c("hash_key"="HASH_KEY"))|>
  tidylog::left_join(invalid_adm_age_c6, by=c("hash_key"="HASH_KEY"))|>
  tidylog::left_join(invalid_adm_age_mortality, by=c("hash_key"="hashkey"))|>
  tidylog::left_join(invalid_adm_age_c1_2324, by=c("hash_key"="hash_key"))|>
  # Count non-missing external dates per row (same column span as above).
  tidytable::mutate_rowwise(
    non_NA_count = rowSums(!is.na(across(h_avg_birth_date:c1_2324_fechnac_1)))
  )|>
  dplyr::filter(non_NA_count==0)|>
  (\(df) {
    print(message(paste0("Invalid birth date without external birth dates, Entries: ", nrow(df))))
    print(message(paste0("Invalid birth date without external birth dates, RUNs: ", tidytable::distinct(df, hash_key)|> nrow())))
    df
  })() Code
#Invalid birth date without external birth dates, Entries: 144 #148 #110
#Invalid birth date without external birth dates, RUNs: 113 #115 #96
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#:#:
invisible("long format")
# Reshape all candidate birth dates for the flagged persons into long format:
# one row per (hash_key, source_birth, birth_date). Original C1 dates get
# labels "orig_1..orig_k"; every external source is melted and appended.
# NOTE(review): each bind_rows step calls reshape2::melt() three times on the
# same table (for id, variable, value) — melting once into a temporary would
# be cheaper, but the result is the same.
invalid_adm_ages_long<-
invalid_adm_ages_previous_values|>
tidytable::group_by(hash_key)|>
tidytable::mutate(source_birth= dplyr::row_number())|>
tidytable::ungroup()|>
tidytable::select(hash_key, source_birth, everything())|>
tidytable::mutate(source_birth= paste0("orig_",source_birth))|>
tidytable::select(hash_key, source_birth, birth_date)|>
#first add hospital data
tidytable::bind_rows(cbind.data.frame(hash_key= invalid_adm_age_hosp_avg$run, source_birth= rep("hosp_avg",times= nrow(invalid_adm_age_hosp_avg)), birth_date= invalid_adm_age_hosp_avg$h_avg_birth_date))|>
# Then TOP and complementary databases C2-C6, mortality, and C1 2023-24.
tidytable::bind_rows(cbind.data.frame(hash_key= reshape2::melt(invalid_adm_age_top, id="HASH_KEY")$HASH_KEY, source_birth= as.character(reshape2::melt(invalid_adm_age_top, id="HASH_KEY")$variable), birth_date= reshape2::melt(invalid_adm_age_top, id="HASH_KEY")$value))|>
tidytable::bind_rows(cbind.data.frame(hash_key= reshape2::melt(invalid_adm_age_c2, id="HASH_KEY")$HASH_KEY, source_birth= as.character(reshape2::melt(invalid_adm_age_c2, id="HASH_KEY")$variable), birth_date= reshape2::melt(invalid_adm_age_c2, id="HASH_KEY")$value))|>
tidytable::bind_rows(cbind.data.frame(hash_key= reshape2::melt(invalid_adm_age_c3, id="HASH_KEY")$HASH_KEY, source_birth= as.character(reshape2::melt(invalid_adm_age_c3, id="HASH_KEY")$variable), birth_date= reshape2::melt(invalid_adm_age_c3, id="HASH_KEY")$value))|>
tidytable::bind_rows(cbind.data.frame(hash_key= reshape2::melt(invalid_adm_age_c4, id="HASH_KEY")$HASH_KEY, source_birth= as.character(reshape2::melt(invalid_adm_age_c4, id="HASH_KEY")$variable), birth_date= reshape2::melt(invalid_adm_age_c4, id="HASH_KEY")$value))|>
tidytable::bind_rows(cbind.data.frame(hash_key= reshape2::melt(invalid_adm_age_c5, id="HASH_KEY")$HASH_KEY, source_birth= as.character(reshape2::melt(invalid_adm_age_c5, id="HASH_KEY")$variable), birth_date= reshape2::melt(invalid_adm_age_c5, id="HASH_KEY")$value))|> tidytable::bind_rows(cbind.data.frame(hash_key= reshape2::melt(invalid_adm_age_c6, id="HASH_KEY")$HASH_KEY, source_birth= as.character(reshape2::melt(invalid_adm_age_c6, id="HASH_KEY")$variable), birth_date= reshape2::melt(invalid_adm_age_c6, id="HASH_KEY")$value))|>
tidytable::bind_rows(cbind.data.frame(hash_key= reshape2::melt(invalid_adm_age_mortality, id="hashkey")$hashkey, source_birth= as.character(reshape2::melt(invalid_adm_age_mortality, id="hashkey")$variable), birth_date= reshape2::melt(invalid_adm_age_mortality, id="hashkey")$value))|>
tidytable::bind_rows(cbind.data.frame(hash_key= reshape2::melt(invalid_adm_age_c1_2324, id="hash_key")$hash_key, source_birth= as.character(reshape2::melt(invalid_adm_age_c1_2324, id="hash_key")$variable), birth_date= reshape2::melt(invalid_adm_age_c1_2324, id="hash_key")$value))|>
tidytable::arrange(hash_key, source_birth)|>
# Drop candidate rows whose source had no birth date for this person.
tidytable::filter(!is.na(birth_date))|>
# Reporter (NOTE(review): print(message(...)) also prints "NULL").
(\(df) {
print(message(paste0("Long database of HASHs with invalid ages, Entries: ", nrow(df))))
print(message(paste0("Long database of HASHs with invalid ages, RUNs: ", tidytable::distinct(df, hash_key)|> nrow())))
df
})() Code
# Long database of HASHs with invalid ages, Entries: 415
# Long database of HASHs with invalid ages, RUNs: 177
# invalid_adm_ages_long$adm_date_rec<-NULLNULL
NULL
NULL
NULL
NULL
NULL
We considered, for the evaluation of valid birth dates, the substance use onset ages, the age at first admission, the number of times the birth date was recorded as identical to the admission date, and a flag column called rec_min_adm_yr, set to 1 when the admission age was greater than 90 or lower than 3 years old.
Code
# Prosecutor's Office (PO) birth dates for the flagged persons:
# right-join the C1 key map against the PO database, keep the flagged
# hash_keys, and reduce to one (HASH_KEY.x, avg_birth_date_po) pair each.
invalid_adm_age_PO_office<-
OLD_NEW_SISTRAT23_c1_2010_2022_df2|>
tidylog::right_join(Base_fiscalia_v2, by=c("HASH_KEY.y"="rut_enc_saf"))|>
tidytable::select("HASH_KEY.x","HASH_KEY.y", "sexo.y","avg_birth_date_po")|>
tidytable::filter(HASH_KEY.x %in% hashs_invalid_adm_age)|>
# Reporter; passes df through (print(message(...)) also prints "NULL").
(\(df) {
print(message(paste0("PO Office, Entries: ", nrow(df))))
print(message(paste0("PO Office, RUNs: ", tidytable::distinct(df, HASH_KEY.x)|> nrow())))
df
})()|>
tidytable::distinct(HASH_KEY.x, avg_birth_date_po)|>
tidytable::ungroup()Code
# PO Office, Entries: 2488
# PO Office, RUNs: 212
# Per-person summaries used later to judge birth-date plausibility:
# mean onset ages, minimum admission age, and how many entries recorded the
# birth date equal to the admission date. The rendered warnings below show
# that groups with all-NA adm_yr make min() return Inf.
summary_invalid_adm_ages_chars<-
tidytable::filter(SISTRAT23_c1_2010_2022_df_prev1d, hash_key %in% hashs_invalid_adm_age)|>
tidytable::summarise(avg_prim_subs_adm= mean(edad_inicio_sustancia_principal, na.rm=T), avg_subs_onset= mean(edad_inicio_consumo, na.rm=T), min_adm_yr= min(adm_yr, na.rm=T),sum_eq_bd_adm= sum(adm_date_rec==birth_date),.by="hash_key",.groups="drop")|>
# rec_min_adm_yr: 1 when the minimum admission age is implausible (>90 or <3).
# NOTE(review): an Inf min_adm_yr (all-NA group) is also flagged 1 here.
tidytable::mutate(rec_min_adm_yr= ifelse(min_adm_yr>90|min_adm_yr<3,1,0)) Warning in min(adm_yr, na.rm = T): ningún argumento finito para min; retornando Inf
Warning in min(adm_yr, na.rm = T): ningún argumento finito para min; retornando Inf
Warning in min(adm_yr, na.rm = T): ningún argumento finito para min; retornando Inf
Warning in min(adm_yr, na.rm = T): ningún argumento finito para min; retornando Inf
Warning in min(adm_yr, na.rm = T): ningún argumento finito para min; retornando Inf
Warning in min(adm_yr, na.rm = T): ningún argumento finito para min; retornando Inf
Warning in min(adm_yr, na.rm = T): ningún argumento finito para min; retornando Inf
Warning in min(adm_yr, na.rm = T): ningún argumento finito para min; retornando Inf
Warning in min(adm_yr, na.rm = T): ningún argumento finito para min; retornando Inf
Warning in min(adm_yr, na.rm = T): ningún argumento finito para min; retornando Inf
Code
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
# Recompute the project root (strip any trailing "cons" component from the
# working directory) and choose the Google-Drive base path whose drive
# letter matches the first alphabetic run of the path.
wdpath <- paste0(gsub("/cons", "", gsub("cons", "", paste0(getwd(), "/cons"))))
envpath <- if (regmatches(wdpath, regexpr("[A-Za-z]+", wdpath)) == "G") {
  "G:/Mi unidad/Alvacast/SISTRAT 2023/"
} else {
  "E:/Mi unidad/Alvacast/SISTRAT 2023/"
}
envpath
invisible("manual review of criteria to see invalid adm ages")
# "Main"-source candidates (original C1 and TOP rows, source_birth matching
# ^orig_|^t_): attach PO dates and per-person summary stats, count distinct
# birth dates per (hash_key, main), then keep one row per distinct birth
# date within each hash_key.
invalid_adm_ages_long_w_po_and_c1_info_orig_t <-
invalid_adm_ages_long|>
tidytable::left_join(invalid_adm_age_PO_office, c("hash_key"="HASH_KEY.x"))|>
# NOTE(review): this join is given `.by=` rather than `by=`; confirm
# tidytable routes it to the join key and does not fall back to a
# natural join on all common columns.
tidytable::left_join(summary_invalid_adm_ages_chars, .by="hash_key")|>
tidytable::mutate(main= ifelse(grepl("^orig_|^t_", source_birth),1,0))|>
tidytable::group_by(hash_key,main)|>
tidytable::mutate(ndis_birth_date= n_distinct(birth_date))|>
tidytable::ungroup()|>
tidytable::filter(main==1)|>
#2024-11-17: it was selecting only distinct birth dates, now it will only select distinct birthdates within each hash_key
tidytable::group_by(hash_key)|>
tidytable::distinct(birth_date, .keep_all =T)|>
tidytable::ungroup()
# Same construction as the *_orig_t table above, but keeping the NON-main
# sources (main == 0): hospital, C2-C6, mortality, C1 2023-24 candidates.
# No distinct() here: all external rows are kept for contrast.
invalid_adm_ages_long_w_po_and_c1_info_other <-
invalid_adm_ages_long|>
tidytable::left_join(invalid_adm_age_PO_office, c("hash_key"="HASH_KEY.x"))|>
# NOTE(review): `.by=` passed to a join — see note on the *_orig_t block.
tidytable::left_join(summary_invalid_adm_ages_chars, .by="hash_key")|>
tidytable::mutate(main= ifelse(grepl("^orig_|^t_", source_birth),1,0))|>
tidytable::group_by(hash_key,main)|>
tidytable::mutate(ndis_birth_date= n_distinct(birth_date))|>
tidytable::ungroup()|>
tidytable::filter(main==0)
invisible("If there are different values among TOP and C1, I kept every distinct ones to contrast w/ other sources")
# Recombine main (C1/TOP) and external candidate rows into one long table,
# ordered per person, and export it for manual review.
invalid_adm_ages_long_w_po_and_c1_info<-
tidytable::bind_rows(invalid_adm_ages_long_w_po_and_c1_info_orig_t,
invalid_adm_ages_long_w_po_and_c1_info_other)|>
tidytable::arrange(hash_key, main, birth_date)
# Recompute output paths (same recipe used throughout the script).
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath
invalid_adm_ages_long_w_po_and_c1_info |> rio::export(file=paste0(wdpath,"cons/_out/invalid_adm_yrs_long.csv"))
#031b27e9f5191197cd7db1aa85232937e424c47bc24c0922e = adm_date= birth_date
#
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#
invisible("W/o ext data, long format. Check if any of the alternative birth dates is closer to mean")
# Long-format table for persons WITHOUT any external birth date: number the
# C1 entries per person and label them "orig_1..orig_k".
invalid_adm_ages_no_ext_data_long<-
invalid_adm_ages_no_ext_data|>
# Reporter (NOTE(review): print(message(...)) also prints "NULL").
(\(df) {
print(message(paste0("Invalid or no admission ages, No external data, Entries: ", nrow(df))))
print(message(paste0("Invalid or no admission ages, No external data, RUNs: ", tidytable::distinct(df, hash_key)|> nrow())))
df
})() |>
tidytable::group_by(hash_key)|>
tidytable::mutate(source_birth= dplyr::row_number())|>
tidytable::ungroup()|>
tidytable::select(hash_key, source_birth, everything())|>
tidytable::mutate(source_birth= paste0("orig_",source_birth))|>
tidytable::select(hash_key, source_birth, birth_date) Code
#110
#96
invisible("Added with PO office records")
# Attach PO-office dates and per-person summary stats to the no-external-data
# long table; keep only the "main" (orig_/t_) rows, one per distinct birth
# date. This is the input of process_group_noextinfo() below.
invalid_adm_ages_no_ext_data_long_w_po_and_c1_info <-
invalid_adm_ages_no_ext_data_long|>
tidytable::left_join(invalid_adm_age_PO_office, c("hash_key"="HASH_KEY.x"))|>
# NOTE(review): `.by=` passed to a join — see note on the *_orig_t block.
tidytable::left_join(summary_invalid_adm_ages_chars, .by="hash_key")|>
tidytable::mutate(main= ifelse(grepl("^orig_|^t_", source_birth),1,0))|>
tidytable::group_by(hash_key,main)|>
tidytable::mutate(ndis_birth_date= n_distinct(birth_date))|>
tidytable::ungroup()|>
tidytable::filter(main==1)|>
# NOTE(review): unlike the with-external-data branch, distinct() here is NOT
# preceded by group_by(hash_key) — confirm deduplication scope is intended.
tidytable::distinct(birth_date, .keep_all =T)
# Recompute output paths and export the no-external-data candidates for
# manual review.
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath
invalid_adm_ages_no_ext_data_long_w_po_and_c1_info|>
rio::export(file=paste0(wdpath,"cons/_out/invalid_adm_yrs_long_w_ext_info.csv"))NULL
NULL
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/"
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/"
NULL
NULL
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/"
We need to consider that the oldest 1% were born on 1951-02-17, while the youngest 1% were born on 2000-05-17. Thus, the former could have been at most 71 years old, while the latter could have been at least 10 years old at admission. Any admission age outside that range will be considered anomalous. However, some extremely implausible birth dates were those on or after Jan. 1, 2010, or on or before Jan. 1, 1910. According to Tukey’s criteria ( \(Q_{1,3} \pm 1.5\times IQR\) ), outliers were identified as dates earlier than 1949-09-09 or later than 2011-05-27.
Code
#https://bergant.github.io/bpmn/
# Render the BPMN workflow diagram of the birth-date selection rules, save it
# as an HTML widget, and screenshot it to PNG for the document.
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath
bpmn::bpmn(paste0(wdpath, "cons/_input/diagram_invalid_adm_ages.bpmn"))
htmlwidgets::saveWidget(bpmn::bpmn(paste0(wdpath, "cons/_input/diagram_invalid_adm_ages.bpmn")), paste0(wdpath,"cons/_figs/diagram_invalid_adm_ages.html"))
webshot::webshot(paste0(wdpath,"cons/_figs/diagram_invalid_adm_ages.html"),paste0(wdpath,"cons/_figs/diagram_invalid_adm_ages.png"), vwidth = 300*1.2, vheight = 300, zoom=10, expand=100) # Try different top, left, width, and height coordinates.[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/"
Workflow for rule-based selection of valid birth dates
We started selecting values for those cases with no other external official information provided (invalid_adm_ages_no_ext_data_long_w_po_and_c1_info). We ensured that birth dates were corrected accurately by considering both the proximity to the average birth date and the presence of anomalies based on conditional checks. Records requiring further attention were clearly flagged for potential probabilistic imputation. Cases where only one anomalous birth date exists are flagged, and imputation is recommended. For instances with multiple dates, the one closest to the average was selected. Records containing only anomalous birth dates are flagged, with imputation recommended. Lastly, records with no anomalies either retained the original date or used the date closest to the mean.
Code
#https://chatgpt.com/share/6738f80f-cd6c-8010-a7d1-cc8a5669bfcd
invisible("Define average birth date, and percentiles 1 and 99")
# Reference statistics of the C1 birth-date distribution used by the
# selection rules: mean, and the 1st/99th percentile plausibility band.
# Fix: spell out TRUE (T is a reassignable binding, not a reserved word).
avg_birth_date <- mean(SISTRAT23_c1_2010_2022_df_prev1d$birth_date, na.rm = TRUE)
q1_birth_date <- as.Date(quantile(unclass(SISTRAT23_c1_2010_2022_df_prev1d$birth_date), 0.01, na.rm = TRUE), origin = '1970-01-01')
q99_birth_date <- as.Date(quantile(unclass(SISTRAT23_c1_2010_2022_df_prev1d$birth_date), 0.99, na.rm = TRUE), origin = '1970-01-01')
avg_birth_date; q1_birth_date; q99_birth_date
invisible("Define anomalous admission ages")
# [1] "1980-01-12"
# 1%
# "1951-02-17"
# 99%
# "2000-05-17"
# Flag persons whose minimum admission age falls outside [10, 71] years
# (the ages implied by the 1st/99th percentile birth dates at admission).
invalid_adm_ages_no_ext_data_long_w_po_and_c1_info$min_adm_yr_flag <- with(
invalid_adm_ages_no_ext_data_long_w_po_and_c1_info,
ifelse(min_adm_yr > 71 | min_adm_yr < 10, "anomalous", "")
)
cat("IQRs")
# Tukey fences (Q1/Q3 -/+ 1.5*IQR) expressed as ages at 2022-01-01.
# Fix: as.Date() on a numeric needs an explicit origin (required before
# R 4.3, where the default origin was introduced).
as.numeric(as.Date("2022-01-01") - as.Date(quantile(unclass(SISTRAT23_c1_2010_2022_df_prev1d$birth_date), .25, na.rm = TRUE) - 1.5 * IQR(unclass(SISTRAT23_c1_2010_2022_df_prev1d$birth_date), na.rm = TRUE), origin = '1970-01-01')) / 365.25
as.numeric(as.Date("2022-01-01") - as.Date(quantile(unclass(SISTRAT23_c1_2010_2022_df_prev1d$birth_date), .75, na.rm = TRUE) + 1.5 * IQR(unclass(SISTRAT23_c1_2010_2022_df_prev1d$birth_date), na.rm = TRUE), origin = '1970-01-01')) / 365.25
# [1] 72.31075
# [1] 10.59959
invisible("=======================================================")
invisible("Function to process cases with no ext. info.")
process_group_noextinfo <- function(df_group) {
  # Resolve the birth date for one person (one hash_key) when no external
  # source supplied a birth date, applying rules 2.2.1.a-2.2.1.g.
  # Relies on globals defined earlier in the script: `avg_birth_date`,
  # `q1_birth_date`, `q99_birth_date` (mean / 1st / 99th percentile of the
  # C1 birth-date distribution).
  #
  # @param df_group data frame with this person's rows; must carry columns
  #   `birth_date` (Date), `avg_birth_date_po` (Date or NA),
  #   `min_adm_yr_flag` ("anomalous" or ""), `ndis_birth_date` (integer).
  # @return `df_group` with `sel_birth_date` (chosen date, or NA when
  #   imputation is recommended) and `FLAG` (rule label) filled in.

  # Initialize as typed NAs so whole-column assignments keep their class.
  df_group$sel_birth_date <- as.Date(NA)
  df_group$FLAG <- NA_character_

  # Collapse per-group attributes to scalars; `[1]` guards against the
  # unexpected case of more than one distinct value inside a group.
  avg_birth_date_po <- unique(df_group$avg_birth_date_po)[1]
  min_adm_yr_flag <- unique(df_group$min_adm_yr_flag)[1]
  ndis <- unique(df_group$ndis_birth_date)[1]
  birth_dates <- unique(df_group$birth_date)

  # Fix: the original used vectorized `&` on possibly NA values inside
  # `if`, which fails when min_adm_yr_flag is NA or non-scalar. isTRUE()
  # plus `&&` makes each condition a well-defined scalar.
  has_po <- !is.na(avg_birth_date_po)
  is_anomalous <- isTRUE(min_adm_yr_flag == "anomalous")
  not_anomalous <- isTRUE(min_adm_yr_flag == "")

  if (has_po && is_anomalous) {
    # Condition 1: PO birth date available & admission age anomalous —
    # trust the PO record outright.
    df_group$sel_birth_date <- avg_birth_date_po
    df_group$FLAG <- '2.2.1.a.Anomalous adm age, have a PO birth date, replace birth date'
  } else if (has_po && not_anomalous) {
    # Condition 2: PO birth date available, admission age plausible —
    # pick, among C1 dates plus the PO date, the one closest to the mean.
    candidates <- c(birth_dates, avg_birth_date_po)
    candidates <- candidates[!is.na(candidates)]
    diffs <- abs(as.numeric(candidates) - as.numeric(avg_birth_date))
    df_group$sel_birth_date <- candidates[which.min(diffs)]
    df_group$FLAG <- '2.2.1.b. Not anomalous adm age, have a PO birth date, replace birth date with the closest birth date to the average birth date'
  } else if (!has_po && is_anomalous) {
    # Condition 3: no PO birth date & anomalous admission age.
    if (ndis == 1) {
      # A single (anomalous) candidate: leave NA, recommend imputation.
      df_group$FLAG <- '2.2.1.c.Only 1 anomalous birth date, imputation is recommended'
      df_group$sel_birth_date <- as.Date(NA)
    } else if (ndis > 1) {
      # Keep only candidates inside the 1st-99th percentile band.
      valid_dates <- birth_dates[birth_dates >= q1_birth_date & birth_dates <= q99_birth_date]
      if (length(valid_dates) > 0) {
        diffs <- abs(as.numeric(valid_dates) - as.numeric(avg_birth_date))
        df_group$sel_birth_date <- valid_dates[which.min(diffs)]
        df_group$FLAG <- '2.2.1.d.>1 distinct date, selected birth date w/ values closer to average birth date'
      } else {
        df_group$FLAG <- '2.2.1.e.Only anomalous birth dates, imputation is recommended'
        df_group$sel_birth_date <- as.Date(NA)
      }
    }
  } else if (!has_po && not_anomalous) {
    # Condition 4: no PO birth date & plausible admission age.
    if (ndis > 1) {
      # Several candidates: take the one closest to the mean birth date.
      diffs <- abs(as.numeric(birth_dates) - as.numeric(avg_birth_date))
      df_group$sel_birth_date <- birth_dates[which.min(diffs)]
      df_group$FLAG <- '2.2.1.f.No anomalous birth dates, used the closest to the mean'
    } else if (ndis == 1) {
      df_group$sel_birth_date <- birth_dates[1]
      df_group$FLAG <- '2.2.1.g.Only one birth date, no anomalous values, kept the original birth date'
    }
  }
  # Groups matching no condition (e.g. NA flag) fall through with NA
  # sel_birth_date and NA FLAG instead of erroring, as before a valid flag.
  df_group
}
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
# Run the no-external-info resolver once per person (hash_key) and
# row-bind the per-group results back into a single table.
proc_invalid_adm_ages_no_ext_data_long_w_po_and_c1_info <-
  invalid_adm_ages_no_ext_data_long_w_po_and_c1_info |>
  dplyr::group_split(hash_key) |>
  purrr::map_dfr(process_group_noextinfo)
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
invisible("h and g do not have values")
# Per-rule counts for the no-external-info resolver (flags 2.2.1.a-g):
# each snippet filters on the FLAG label and reports entry/person counts.
# NOTE(review): grepl() treats the dots in "2.2.1.a" as regex wildcards;
# fixed = TRUE would match the label literally.
# NOTE(review): message() returns NULL invisibly, so print(message(...))
# also emits a stray "NULL" line after each count.
proc_invalid_adm_ages_no_ext_data_long_w_po_and_c1_info |>
dplyr::filter(grepl("2.2.1.a",FLAG )) |>
(\(df) {
print(message(paste0("2.2.1.a, Entries: ", nrow(df))))
print(message(paste0("2.2.1.a, RUNs: ", tidytable::distinct(df, hash_key) |> nrow())))
})()Code
proc_invalid_adm_ages_no_ext_data_long_w_po_and_c1_info |>
dplyr::filter(grepl("2.2.1.b",FLAG )) |>
(\(df) {
print(message(paste0("2.2.1.b, Entries: ", nrow(df))))
print(message(paste0("2.2.1.b, RUNs: ", tidytable::distinct(df, hash_key) |> nrow())))
})()Code
proc_invalid_adm_ages_no_ext_data_long_w_po_and_c1_info |>
dplyr::filter(grepl("2.2.1.c",FLAG )) |>
(\(df) {
print(message(paste0("2.2.1.c, Entries: ", nrow(df))))
print(message(paste0("2.2.1.c, RUNs: ", tidytable::distinct(df, hash_key) |> nrow())))
})()Code
proc_invalid_adm_ages_no_ext_data_long_w_po_and_c1_info |>
dplyr::filter(grepl("2.2.1.d",FLAG )) |>
(\(df) {
print(message(paste0("2.2.1.d, Entries: ", nrow(df))))
print(message(paste0("2.2.1.d, RUNs: ", tidytable::distinct(df, hash_key) |> nrow())))
})()Code
proc_invalid_adm_ages_no_ext_data_long_w_po_and_c1_info |>
dplyr::filter(grepl("2.2.1.e",FLAG )) |>
(\(df) {
print(message(paste0("2.2.1.e, Entries: ", nrow(df))))
print(message(paste0("2.2.1.e, RUNs: ", tidytable::distinct(df, hash_key) |> nrow())))
})()Code
proc_invalid_adm_ages_no_ext_data_long_w_po_and_c1_info |>
dplyr::filter(grepl("2.2.1.f",FLAG )) |>
(\(df) {
print(message(paste0("2.2.1.f, Entries: ", nrow(df))))
print(message(paste0("2.2.1.f, RUNs: ", tidytable::distinct(df, hash_key) |> nrow())))
})()Code
proc_invalid_adm_ages_no_ext_data_long_w_po_and_c1_info |>
dplyr::filter(grepl("2.2.1.g",FLAG )) |>
(\(df) {
print(message(paste0("2.2.1.g, Entries: ", nrow(df))))
print(message(paste0("2.2.1.g, RUNs: ", tidytable::distinct(df, hash_key) |> nrow())))
})()Code
# 2.2.1.a.Anomalous adm age, have a PO birth date, replace birth date
# 2.2.1.b. Not anomalous adm age, have a PO birth date, replace birth date with the closest birth date to the average birth date
# 2.2.1.c.Only 1 anomalous birth date, imputation is recommended
# 2.2.1.d.>1 distinct date, selected birth date w/ values closer to average birth date
# 2.2.1.e.Only anomalous birth dates, imputation is recommended
# 2.2.1.f.No anomalous birth dates, used the closest to the mean
# 2.2.1.g.Only one birth date, no anomalous values, kept the original birth date
# 2.2.1.a, Entries: 12 #55
# 2.2.1.a, RUNs: 12 $55
# 2.2.1.b, Entries: 2 #11
# 2.2.1.b, RUNs: 2 #11
# 2.2.1.c, Entries: 58 #15
# 2.2.1.c, RUNs: 58 #15
# 2.2.1.d, Entries: 64 #26
# 2.2.1.d, RUNs: 31 #12
# 2.2.1.e, Entries: 0 #0
# 2.2.1.e, RUNs: 0 #0
# 2.2.1.f, Entries: 0 #0
# 2.2.1.f, RUNs: 0 #0
# 2.2.1.g, Entries: 10 #1
# 2.2.1.g, RUNs: 10 #1
#8e3591364bcc3eb5630f2db00e6f15bd57c07604e7c60f6086151f81f38b3727, no lo codificó bien, debiese haber sido d.[1] "1980-01-12"
1%
"1951-02-17"
99%
"2000-05-17"
IQRs[1] 72.31075
[1] 10.59959
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
In our dataset, we encountered individuals with multiple birth dates from various sources, including external records such as hospital admission dates (hosp_avg). To improve data integrity and select the most reliable birth date for each individual, we applied a systematic set of rules, each labeled for reference (e.g., 2.2.2.a to 2.2.2.i). These required the removal of implausible birth dates, prioritizing reliable sources, analyzing multiple valid birth dates, handling single valid birth dates after removal, and fallback options.
Code
#https://chatgpt.com/c/6737c7a6-4318-8010-98f3-512061b56100
# Recompute the 1st/99th percentile birth dates used as the plausibility band.
q1_birth_date <- as.Date(
  quantile(unclass(SISTRAT23_c1_2010_2022_df_prev1d$birth_date), 0.01, na.rm = TRUE),
  origin = '1970-01-01'
)
q99_birth_date <- as.Date(
  quantile(unclass(SISTRAT23_c1_2010_2022_df_prev1d$birth_date), 0.99, na.rm = TRUE),
  origin = '1970-01-01'
)
invisible("Define anomalous admission ages")
# Flag admission ages outside [10, 71] years as "anomalous" ("" otherwise).
invalid_adm_ages_long_w_po_and_c1_info$min_adm_yr_flag <- ifelse(
  invalid_adm_ages_long_w_po_and_c1_info$min_adm_yr > 71 |
    invalid_adm_ages_long_w_po_and_c1_info$min_adm_yr < 10,
  "anomalous",
  ""
)
invisible("=======================================================")
invisible("")
# Function to process each group
# Resolve the birth date for one person (one hash_key) that DOES have
# external candidate dates, applying rules 2.2.2.a-2.2.2.i in order of
# priority (mortality > PO office > multi-candidate analysis > single
# candidate > fallbacks). Relies on globals `avg_birth_date`,
# `q1_birth_date`, `q99_birth_date` defined earlier in the script.
# NOTE(review): several `if` conditions below use vectorized `&` and
# unique() results that could be length > 1 (e.g. avg_birth_date_po);
# confirm each group carries a single value, otherwise `if` may fail.
process_inv_birthdates_w_info <- function(df_group) {
# Initialize 'sel_birth_date', 'FLAG', and 'obs'
df_group <- df_group |>
mutate(sel_birth_date = as.Date(NA),
FLAG = NA,
obs = NA) |>
# AGS: eliminate incorrect dates
# Extremely implausible dates (on/after 2010-01-01 or on/before
# 1910-01-01) are blanked before any rule runs.
mutate(
birth_date = case_when(
birth_date >= as.Date("2010-01-01") ~ as.Date(NA),
birth_date <= as.Date("1910-01-01") ~ as.Date(NA),
TRUE ~ birth_date
)
)
# Extract unique values needed
m_birthdate <- unique(df_group$birth_date[df_group$source_birth == "m_birthdate"])
avg_birth_date_po <- unique(df_group$avg_birth_date_po)
#sum_eq_adm <- unique(df_group$sum_eq_adm)
min_adm_yr_flag <- unique(df_group$min_adm_yr_flag)
ndis_birth_date <- unique(df_group$ndis_birth_date)
birth_dates <- unique(df_group$birth_date)
# Remove NA values from birth_dates
birth_dates <- birth_dates[!is.na(birth_dates)]
# Rule 1: Prioritize 'm_birthdate' if available (obs_label = "2.2.2.a")
# The mortality registry date is treated as the most reliable source.
if (length(m_birthdate) > 0) {
df_group <- df_group %>%
mutate(sel_birth_date = m_birthdate,
obs = "2.2.2.a. Prioritized birthdate of mortality database")
return(df_group)
}
# Rule 2: Use 'avg_birth_date_po' if within valid range (obs_label = "2.2.2.b")
if (is.na(df_group$sel_birth_date[1]) & !is.na(avg_birth_date_po)) {
# Check if 'avg_birth_date_po' is within valid dates
if (avg_birth_date_po >= q1_birth_date & avg_birth_date_po <= q99_birth_date) {
df_group <- df_group %>%
mutate(sel_birth_date = avg_birth_date_po,
obs = "2.2.2.b. Used PO date if within percentiles 1 and 99")
return(df_group)
}
}
# Rule 3: Analyze multiple birth dates (obs_label = "2.2.2.c")
if (length(birth_dates) > 1) {
# Filter birth_dates within valid range
valid_birth_dates <- birth_dates[birth_dates >= q1_birth_date & birth_dates <= q99_birth_date]
if (length(valid_birth_dates) > 0) {
# Check if any of the valid_birth_dates come from 'hosp_avg' source
hosp_avg_dates <- df_group$birth_date[df_group$source_birth == "hosp_avg"]
valid_hosp_avg_dates <- intersect(valid_birth_dates, hosp_avg_dates)
# Use day and month from other sources
other_b_dates <- setdiff(valid_birth_dates, hosp_avg_dates)
# Proceed to select the most frequent date
# Select the most frequent date among valid_birth_dates
# NOTE(review): sort(-table(...)) picks the most frequent value; ties
# resolve by table ordering, not by closeness to the mean.
other_b_dates_mfv <- as.numeric(names(sort(-table(other_b_dates)))[1])
if (length(valid_hosp_avg_dates)>0 & length(other_b_dates)>0){
# Rule 7: Use 'hosp_avg' date for year information (obs_label = "2.2.2.g")
# Hospital admissions anchor the YEAR; day/month come from the most
# frequent non-hospital date.
avg_hosp_date <- unique(valid_hosp_avg_dates)
# print(paste0("avg hosp date:",as.Date(avg_hosp_date[1])))
avg_hosp_year <- as.integer(format(as.Date(avg_hosp_date[1]), "%Y"))
# print(paste0("avg hosp year:",avg_hosp_year))
# print(paste0("other dates,mfv:",as.Date.numeric(other_b_dates_mfv)))
day_month <- format(as.Date.numeric(other_b_dates_mfv), "%m-%d")
new_birth_date_str <- paste(avg_hosp_year, day_month, sep = "-")
new_birth_date <- as.Date(new_birth_date_str, format = "%Y-%m-%d")
# # Check if new_birth_date is within valid range
# if (!is.na(new_birth_date) && new_birth_date >= q1_birth_date & new_birth_date <= q99_birth_date) {
df_group <- df_group %>%
mutate(sel_birth_date = new_birth_date,
obs = "2.2.2.g. Select hospital birth year, keep month & day of other sources")
return(df_group)
# }
}
# Proceed to select the most frequent date
# Select the most frequent date among valid_birth_dates
birth_date_counts <- table(valid_birth_dates)
max_count <- max(birth_date_counts)
most_freq_dates <- as.Date(names(birth_date_counts[birth_date_counts == max_count]))
# If there's a tie, select the date closest to average birth date
if (length(most_freq_dates) > 1) {
diffs <- abs(as.numeric(most_freq_dates) - as.numeric(avg_birth_date))
sel_date <- most_freq_dates[which.min(diffs)]
} else {
sel_date <- most_freq_dates
}
df_group <- df_group %>%
mutate(sel_birth_date = sel_date,
obs = "2.2.2.c.Multiple birth dates. Chose the most frequent. If ties, selected the closest to the average date")
return(df_group)
} else {
# No valid birth dates within the percentile range
# Proceed to select date closest to average birth date from all birth_dates
diffs <- abs(as.numeric(birth_dates) - as.numeric(avg_birth_date))
sel_date <- birth_dates[which.min(diffs)]
df_group <- df_group %>%
mutate(sel_birth_date = sel_date,
FLAG = 'Selected date closest to average birth date',
obs = "2.2.2.d.No valid birthdate. Select closest to average") # Rule 4 applied here
return(df_group)
}
}
# Rule 5: If only one birth date is available (obs_label = "2.2.2.e")
if (length(birth_dates) == 1) {
# Check if it's within valid range
if (birth_dates >= q1_birth_date & birth_dates <= q99_birth_date) {
df_group <- df_group %>%
mutate(sel_birth_date = birth_dates,
obs = "2.2.2.e. Single birth date available, selected")
return(df_group)
} else {
# Birth date is outside valid range, select it but flag it
sel_date <- birth_dates
df_group <- df_group %>%
mutate(sel_birth_date = sel_date,
FLAG = 'Single date outside valid range, selected as is',
obs = "2.2.2.f.Single date outside valid range, selected as is") # Rule 6 applied here
return(df_group)
}
}
# Rule 8: Select date closest to average birth date (obs_label = "2.2.2.h")
# NOTE(review): unreachable after the length > 1 and length == 1 branches
# above — only length 0 falls through; kept for documentation parity.
if (length(birth_dates) > 0) {
diffs <- abs(as.numeric(birth_dates) - as.numeric(avg_birth_date))
sel_date <- birth_dates[which.min(diffs)]
df_group <- df_group %>%
mutate(sel_birth_date = sel_date,
FLAG = 'Selected date closest to average birth date',
obs = "2.2.2.h.Selected date closest to average birth date")
return(df_group)
}
# Rule 9: Flag records with unresolvable inconsistencies (obs_label = "2.2.2.i")
# Reached when every candidate was blanked as extremely implausible.
df_group <- df_group %>%
mutate(FLAG = 'Unresolvable inconsistencies',
obs = "2.2.2.i.Unresolvable inconsistencies or extremely implausible birth dates")
return(df_group)
}
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
# safely() wrapper kept around for interactive debugging of failing groups;
# the plain function is what actually runs below.
process_safe <- purrr::safely(process_inv_birthdates_w_info)
# Run the with-external-info resolver once per person (hash_key) and
# row-bind the per-group results back into a single table.
proc_invalid_adm_ages_long_w_po_and_c1_info <-
  invalid_adm_ages_long_w_po_and_c1_info |>
  dplyr::group_split(hash_key) |>
  #purrr::map_dfr(\(g) process_safe(g)$result)
  purrr::map_dfr(process_inv_birthdates_w_info)
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
invisible("h and g do not have values")
# Per-rule counts for the with-external-info resolver (obs 2.2.2.a-i):
# each snippet filters on the obs label and reports entry/person counts.
# NOTE(review): grepl() treats the dots in "2.2.2.a" as regex wildcards;
# fixed = TRUE would match the label literally.
# NOTE(review): message() returns NULL invisibly, so print(message(...))
# also emits a stray "NULL" line after each count.
proc_invalid_adm_ages_long_w_po_and_c1_info|>
dplyr::filter(grepl("2.2.2.a",obs ))|>
(\(df) {
print(message(paste0("2.2.2.a, Entries: ", nrow(df))))
print(message(paste0("2.2.2.a, RUNs: ", tidytable::distinct(df, hash_key) |> nrow())))
})()Code
proc_invalid_adm_ages_long_w_po_and_c1_info|>
dplyr::filter(grepl("2.2.2.b",obs ))|>
(\(df) {
print(message(paste0("2.2.2.b, Entries: ", nrow(df))))
print(message(paste0("2.2.2.b, RUNs: ", tidytable::distinct(df, hash_key) |> nrow())))
})()Code
proc_invalid_adm_ages_long_w_po_and_c1_info |>
dplyr::filter(grepl("2.2.2.c",obs )) |>
(\(df) {
print(message(paste0("2.2.2.c, Entries: ", nrow(df))))
print(message(paste0("2.2.2.c, RUNs: ", tidytable::distinct(df, hash_key) |> nrow())))
})()Code
proc_invalid_adm_ages_long_w_po_and_c1_info |>
dplyr::filter(grepl("2.2.2.d",obs )) |>
(\(df) {
print(message(paste0("2.2.2.d, Entries: ", nrow(df))))
print(message(paste0("2.2.2.d, RUNs: ", tidytable::distinct(df, hash_key) |> nrow())))
})()Code
proc_invalid_adm_ages_long_w_po_and_c1_info |>
dplyr::filter(grepl("2.2.2.e",obs )) |>
(\(df) {
print(message(paste0("2.2.2.e, Entries: ", nrow(df))))
print(message(paste0("2.2.2.e, RUNs: ", tidytable::distinct(df, hash_key) |> nrow())))
})()Code
proc_invalid_adm_ages_long_w_po_and_c1_info |>
dplyr::filter(grepl("2.2.2.f",obs )) |>
(\(df) {
print(message(paste0("2.2.2.f, Entries: ", nrow(df))))
print(message(paste0("2.2.2.f, RUNs: ", tidytable::distinct(df, hash_key) |> nrow())))
})()Code
proc_invalid_adm_ages_long_w_po_and_c1_info |>
dplyr::filter(grepl("2.2.2.g",obs )) |>
(\(df) {
print(message(paste0("2.2.2.g, Entries: ", nrow(df))))
print(message(paste0("2.2.2.g, RUNs: ", tidytable::distinct(df, hash_key) |> nrow())))
})()Code
proc_invalid_adm_ages_long_w_po_and_c1_info |>
dplyr::filter(grepl("2.2.2.h",obs )) |>
(\(df) {
print(message(paste0("2.2.2.h, Entries: ", nrow(df))))
print(message(paste0("2.2.2.h, RUNs: ", tidytable::distinct(df, hash_key) |> nrow())))
})()Code
proc_invalid_adm_ages_long_w_po_and_c1_info |>
dplyr::filter(grepl("2.2.2.i",obs )) |>
(\(df) {
print(message(paste0("2.2.2.i, Entries: ", nrow(df))))
print(message(paste0("2.2.2.i, RUNs: ", tidytable::distinct(df, hash_key) |> nrow())))
})()Code
# 2.2.2.a. Prioritized birthdate of mortality database
# 2.2.2.b. Used PO date if within percentiles 1 and 99
# 2.2.2.c.Multiple birth dates. Chose the most frequent. If ties, selected the closest to the average date
# 2.2.2.d.No valid birthdate. Select closest to average
# 2.2.2.e. Single birth date available, selected
# 2.2.2.f.Single date outside valid range, selected as is
# 2.2.2.g.Get birth date year from the average dates of hospital admissions
# 2.2.2.h.Selected date closest to average birth date
# 2.2.2.i.Unresolvable inconsistencies or extremely implausible birth dates
# 2.2.2.a, Entries: 54 #54
# NULL
# 2.2.2.a, RUNs: 18 #18
# NULL
# 2.2.2.b, Entries: 36 #232
# NULL
# 2.2.2.b, RUNs: 18 #122
# NULL
# 2.2.2.c, Entries: 11 #2
# NULL
# 2.2.2.c, RUNs: 5 #1
# NULL
# 2.2.2.d, Entries: 18 #14
# NULL
# 2.2.2.d, RUNs: 8 #6
# NULL
# 2.2.2.e, Entries: 183 #35
# NULL
# 2.2.2.e, RUNs: 92 #18
# NULL
# 2.2.2.f, Entries: 7 #7
# NULL
# 2.2.2.f, RUNs: 4 #4
# NULL
# 2.2.2.g, Entries: 31 #11
# NULL
# 2.2.2.g, RUNs: 13 #4
# NULL
# 2.2.2.h, Entries: 0 #0
# NULL
# 2.2.2.h, RUNs: 0 #0
# NULL
# 2.2.2.i, Entries: 19 #4
# NULL
# 2.2.2.i, RUNs: 19 #4
#b002ce96b750e7e32aa9d2bdb611520e938293b7221415784fdb2fceeac52a8f, opciones raras, o 1917 o 2002
#a534782e4a670be9fe584f9b1b47a7f38a73f6b08f18aa7a7b4a45a7ba628dbf
#a3f5934a1b72932a34d89c71aec939a327499373107718abb42f437b5bc94254 por qué sólo una
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
We added the corrected birth date and admission age.
Code
invisible("Check any errors in conversion of invalid ages: no ext data")
# Sanity check: the two processed tables should not share hash_keys
# (a person is resolved by exactly one of the two branches).
proc_birthdates_ext_data |>
dplyr::inner_join(proc_invalid_adm_ages_no_ext_data_long_w_po_and_c1_info, "hash_key") |> nrow()
invisible("Check any errors in conversion of invalid ages: w/ ext data")
hashs_inconsistent_and_invalid_ages<-
proc_birthdates_ext_data |>
dplyr::inner_join(proc_invalid_adm_ages_long_w_po_and_c1_info, "hash_key", multiple="first")
#e3d18a65489e87325c1a4e59286283a211b7ef31505bc48a4f62956cc491f5d3 says it was the only case (2.2.2.e. Single birth date available, selected)
invisible("a534782e4a670be9fe584f9b1b47a7f38a73f6b08f18aa7a7b4a45a7ba628dbf")
#a534782e4a670be9fe584f9b1b47a7f38a73f6b08f18aa7a7b4a45a7ba628dbf only dates below 16
#already noticed in earlier stages
# Merge the selected birth dates back into the main C1 table:
# birth_date_rec takes the resolver output when available (no-external-data
# branch first, then external-data branch), adm_yr_rec is recomputed from
# it, and the OBS audit column accumulates the rule labels.
SISTRAT23_c1_2010_2022_df_prev1e<-
SISTRAT23_c1_2010_2022_df_prev1d|>
tidylog::left_join(proc_invalid_adm_ages_no_ext_data_long_w_po_and_c1_info[,c("hash_key","sel_birth_date","FLAG")], by="hash_key", multiple="first")|>
tidytable::rename("obs_invalid_dates_noext"="FLAG")|>
tidylog::left_join(proc_invalid_adm_ages_long_w_po_and_c1_info[,c("hash_key","sel_birth_date","obs")], by="hash_key", multiple="first")|>
tidytable::rename("obs_invalid_dates_ext"="obs")|>
# .x = no-external-data selection, .y = with-external-data selection.
tidytable::mutate(birth_date_rec = tidytable::case_when( !is.na(obs_invalid_dates_noext)~ as.Date(sel_birth_date.x), T~birth_date))|>
tidytable::mutate(birth_date_rec = tidytable::case_when( !is.na(obs_invalid_dates_ext)~ as.Date(sel_birth_date.y), T~birth_date_rec))|>
# Admission age in years (365.25-day years, 2 decimals).
tidytable::mutate(adm_yr_rec=round(as.numeric((adm_date_rec-birth_date_rec))/365.25,2))|>
tidytable::mutate(OBS = tidytable::case_when( !is.na(obs_invalid_dates_noext)~ glue("{OBS};{obs_invalid_dates_noext}"),T~OBS))|>
tidytable::mutate(OBS = tidytable::case_when( !is.na(obs_invalid_dates_ext)~ glue("{OBS};{obs_invalid_dates_ext}"),T~OBS))|>
# Trim the leading separator left when OBS started empty.
tidytable::mutate(OBS= gsub("^;", "", OBS))|>
tidytable::select(-any_of(c("sel_birth_date.x","obs_invalid_dates_noext","sel_birth_date.y","obs_invalid_dates_ext")))|>
tidytable::as_tidytable()Code
# Report how many entries (and distinct hashed IDs) still have an implausible
# (>90 or <16 years) or missing admission age after the rule-based corrections.
message(paste0("Number of entries w/ infrequent admission ages (>90|<16) or missing data= ",
tidytable::filter(SISTRAT23_c1_2010_2022_df_prev1e, adm_yr_rec>90|adm_yr_rec<16|is.na(adm_yr_rec))|> nrow(),"\n(HASHs= ",
tidytable::filter(SISTRAT23_c1_2010_2022_df_prev1e, adm_yr_rec>90|adm_yr_rec<16|is.na(adm_yr_rec))|> distinct(hash_key) |> nrow(),")"))Code
# Number of entries w/ infrequent admission ages (>90|<16) or missing data= 114
# (HASHs= 114)
# Same report restricted to entries whose admission age is missing.
message(paste0("Number of entries w/ missing data only= ",
tidytable::filter(SISTRAT23_c1_2010_2022_df_prev1e, is.na(adm_yr_rec))|> nrow(),"\n(HASHs= ",
tidytable::filter(SISTRAT23_c1_2010_2022_df_prev1e, is.na(adm_yr_rec))|> distinct(hash_key) |> nrow(),")"))Code
#6d5f2fc8d4c835e227ac7f99c96f710c235b0415d95571a976b481f9170a4c34 has a missing date
# Distribution of the remaining out-of-range admission ages (missings excluded).
tidytable::filter(SISTRAT23_c1_2010_2022_df_prev1e, adm_yr_rec>90|adm_yr_rec<16)|>
pull(adm_yr_rec)|>
hist(breaks=50, main= "Infrequent ages (>90|<16)", xlab= "Admission age (years)")[1] 0
For example, the ID a534782e4a670be9fe584f9b1b47a7f38a73f6b08f18aa7a7b4a45a7ba628dbf was associated only with age entries below 16 years old, also corroborated by external information.
Imputation methods were evaluated in the script hist_scripts/Duplicates_24_imp_methods.R. However, the imputation was not sufficiently precise —specifically, the imputed admission ages spanned nearly 10 years— so we chose not to use it. We discarded IDs with ages <13 and with missing ages.
Code
# Probabilistic imputation of the recoded birth date for entries whose
# admission age remained invalid/missing after the rule-based corrections.
# Two methods are run for comparison: missRanger (random forests with
# predictive mean matching) and VIM::kNN.
#hashs_invalid_adm_age
# We explored 3 types of imputation: using k-nearest neighbours, random forests and multiple imputation with chained equations. The variables used as candidates were: `"sexo"`, `"tipo_centro"`, `"tipo_de_plan"`, `"pais_nacimiento"`, `"se_trata_de_una_mujer_embarazada"`, `"escolaridad_ultimo_ano_cursado"`, `"sustancia_principal"`, `"edad_inicio_sustancia_principal"`, `"tiene_menores_de_edad_a_cargo"`, `"edad_inicio_consumo"`, `"numero_de_hijos"`, `"estado_conyugal"`, `"TABLE_rec2"`, `"numero_de_tratamientos_anteriores"`, `"usuario_de_tribunales_tratamiento_drogas"`.
#
# The following variables have specific characteristics that should be considered:
#
# - `tiene_menores_de_edad_a_cargo` (responsible for minors): missing data is only present before 2015.
# - `numero_de_hijos` (number of children): values greater than 11 are incorrect.
# - `usuario_de_tribunales_tratamiento_drogas` (drug treatment court user): contains more missing data in 2016.
# - `pais_nacimiento` (country of birth): this information starts being collected in 2016.
# - `se_trata_de_una_mujer_embarazada` (pregnant woman): substantial missing data, but it is evenly distributed across annual datasets.
invisible("Cases with inconsistencies in birth date due to modifications of rule based protocols")
# Hashed IDs with more than one distinct recoded birth date (should be none).
hashs_inconsistent_ages_post_rule_based_imp<-
SISTRAT23_c1_2010_2022_df_prev1e|>
tidytable::group_by(hash_key)|>
tidytable::summarise(n = n_distinct(birth_date_rec))|>
tidytable::filter(n > 1)|>
pull(hash_key)
invisible("es 0")
# SISTRAT23_c1_2010_2022_df_prev1e |>
# dplyr::filter(hash_key %in% hashs_inconsistent_ages_post_rule_based_imp) |> View()
# IDs that still have an out-of-range or missing admission age -> candidates
# for probabilistic imputation.
hashs_for_imputation_adm_birth_year<-
tidytable::filter(SISTRAT23_c1_2010_2022_df_prev1e, adm_yr_rec>90|adm_yr_rec<16|is.na(adm_yr_rec))|> distinct(hash_key)
# Target column for imputation: NA where the admission age is invalid/missing,
# the recoded birth date otherwise.
# NOTE(review): ifelse() drops the Date class, so birth_date_rec_imp becomes
# numeric here; the kNN result is re-converted with as.Date() further below.
SISTRAT23_c1_2010_2022_df_prev1e$birth_date_rec_imp<-NA
SISTRAT23_c1_2010_2022_df_prev1e$birth_date_rec_imp<-
ifelse(!is.na(SISTRAT23_c1_2010_2022_df_prev1e$adm_yr_rec),
SISTRAT23_c1_2010_2022_df_prev1e$birth_date_rec,
SISTRAT23_c1_2010_2022_df_prev1e$birth_date_rec_imp)
invisible("Create database")
dataset_with_na <- SISTRAT23_c1_2010_2022_df_prev1e
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
set.seed(2125)
#https://mayer79.github.io/missRanger/articles/missRanger.html
# Random-forest imputation with predictive mean matching (pmm.k = 3 donors).
SISTRAT23_c1_2010_2022_df_prev1e_imp <- missRanger(
data = dataset_with_na,
formula = birth_date_rec_imp ~ sexo+ tipo_centro + tipo_de_plan + pais_nacimiento +
se_trata_de_una_mujer_embarazada + escolaridad_ultimo_ano_cursado +
sustancia_principal + edad_inicio_sustancia_principal +
tiene_menores_de_edad_a_cargo + edad_inicio_consumo +
numero_de_hijos + estado_conyugal+
TABLE_rec2+ numero_de_tratamientos_anteriores+ usuario_de_tribunales_tratamiento_drogas,
num.trees = 5e3,
pmm.k = 3, # Predictive mean matching
keep_forests = T,
returnOOB= T,
#mtry= function(p) max(3, trunc(p / 3)), # At least 3 or parameters/3, whichever is greater.
maxiter= 5e2,
verbose = 2,
seed= 2125,
#case.weights = rowSums(!is.na(SISTRAT23_c1_2010_2022_df_prev1f)) #pass case weights to the imputation models. For instance, this allows to reduce the contribution of rows with many missings
)
paste0("Best iter:", SISTRAT23_c1_2010_2022_df_prev1e_imp$best_iter)
# NOTE(review): the formula above lists 15 predictors but sqrt(12) is used
# here to report mtry — confirm which count is intended.
paste0("Mtry: how many covariates are considered in each tree split: ", floor(sqrt(12)))
#Quick and balanced. More if there are many complexities to capture
#Reduce if there is little data or a risk of overfitting
#OOB prediction error per iteration and variable (1 minus R-squared for regression)
#The default mtry in missRanger is sqrt(p), where p is the number of variables in the dataset.
#OOB prediction errors are quantified as 1 - R^2 for numeric variables, and as classification error otherwise. If a variable has been imputed only univariately, the value is 1.
#https://rdrr.io/cran/missRanger/man/missRanger.html
paste0("The model explains ",scales::percent(1-min(SISTRAT23_c1_2010_2022_df_prev1e_imp$mean_pred_errors), accuracy=.1), " of the variance. This is calculated using the out of bag samples in each tree split")
# [1] "The model explains 51.2% of the variance. This is calculated using the out of bag samples in each tree split"
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
set.seed(2125)
# Alternative imputation: k-nearest neighbours (k = 3, donor mean) on the
# same candidate predictors.
SISTRAT23_c1_2010_2022_df_prev1e_imp_kNN<-VIM::kNN(dataset_with_na, variable = c("birth_date_rec_imp"), dist_var=c("sexo", "tipo_centro", "tipo_de_plan", "pais_nacimiento","se_trata_de_una_mujer_embarazada", "escolaridad_ultimo_ano_cursado", "sustancia_principal", "edad_inicio_sustancia_principal", "tiene_menores_de_edad_a_cargo", "edad_inicio_consumo","numero_de_hijos", "estado_conyugal", "TABLE_rec2", "numero_de_tratamientos_anteriores", "usuario_de_tribunales_tratamiento_drogas"),
numFun = "mean",
k=3,
trace=T)
#Time difference of 1.000705 hoursCode
# Inspect the kNN-imputed column; ifelse() earlier stripped the Date class,
# so it is re-converted with as.Date() for readability.
SISTRAT23_c1_2010_2022_df_prev1e_imp_kNN$birth_date_rec_imp
as.Date(SISTRAT23_c1_2010_2022_df_prev1e_imp_kNN$birth_date_rec_imp)
#2.2.1.c
#2.2.1.a
#2.2.2.f
#2.2.2.i
#Extremely implausible birth dates (on or after Jan. 1, 2010, or on or before Jan. 1, 1910)= there may be no cases with only implausible dates; in that case nothing can be done
invisible("HASHs that were filed for probabilistic imputation")
# IDs whose OBS audit trail carries one of the rules that flagged them for
# probabilistic imputation.
# NOTE(review): the comment list above names 2.2.1.a but the code greps
# "2.2.1.e" — confirm which rule ID is intended. Also, the unescaped "."
# in these grepl() patterns matches any character (harmless for these
# rule-ID strings, but fixed=TRUE would be stricter).
flowchart_red_for_imp<-
unique(c(
unique(SISTRAT23_c1_2010_2022_df_prev1e$hash_key[grepl("2.2.1.c",SISTRAT23_c1_2010_2022_df_prev1e$OBS)]),
unique(SISTRAT23_c1_2010_2022_df_prev1e$hash_key[grepl("2.2.1.e",SISTRAT23_c1_2010_2022_df_prev1e$OBS)]),
unique(SISTRAT23_c1_2010_2022_df_prev1e$hash_key[grepl("2.2.2.f",SISTRAT23_c1_2010_2022_df_prev1e$OBS)]),
unique(SISTRAT23_c1_2010_2022_df_prev1e$hash_key[grepl("2.2.2.i",SISTRAT23_c1_2010_2022_df_prev1e$OBS)])
))
invisible("Gross criteria for discarding HASHs")
# IDs discarded outright: admission age out of range (>90 or <16) or missing.
discarded_hashs<-
tidytable::filter(SISTRAT23_c1_2010_2022_df_prev1e, adm_yr_rec>90|adm_yr_rec<16|is.na(adm_yr_rec))|> distinct(hash_key) |> pull(hash_key)
invisible("To explore which HASHs are not in the flowchart but fulfill the criteria for discard")
# SISTRAT23_c1_2010_2022_df_prev1f |>
# dplyr::filter(hash_key %in% setdiff(discarded_hashs, flowchart_red_for_imp)) |>
# janitor::tabyl(OBS, show_na = T) |>
# dplyr::arrange(desc(n)) |>
# dplyr::mutate(percent = round(percent*100, digits = 1))
# structure(list(OBS = structure(c("2.2.1.g.Only one birth date, no anomalous values, kept the original birth date",
# "1.1. Duplicated Cases in Almost Every Variable;2.2.2.d.No valid birthdate. Select closest to average",
# "2.2.2.g.Get birth date year from the average dates of hospital admissions",
# "2.2.2.d.No valid birthdate. Select closest to average", "",
# "1.1. Duplicated Cases in Almost Every Variable;2.2.2.b. Used PO date if within percentiles 1 and 99",
# "1.1. Duplicated Cases in Almost Every Variable;2.2.2.c.Multiple birth dates. Chose the most frequent. If ties, selected the closest to the average date",
# "2.2.2.b. Used PO date if within percentiles 1 and 99", "1.1. Duplicated Cases in Almost Every Variable;2.1.1.c.Multiple common dates found. Select the birth date closest to available external records;2.2.2.c.Multiple birth dates. Chose the most frequent. If ties, selected the closest to the average date",
# "1.1. Duplicated Cases in Almost Every Variable;2.2.1.b. Not anomalous adm age, have a PO birth date, replace birth date with the closest birth date to the average birth date",
# "1.1. Duplicated Cases in Almost Every Variable;2.2.2.e. Single birth date available, selected",
# "1.1. Duplicated Cases in Almost Every Variable;2.2.2.g.Get birth date year from the average dates of hospital admissions",
# "2.1.1.c.Multiple common dates found. Select the birth date closest to available external records;2.2.2.c.Multiple birth dates. Chose the most frequent. If ties, selected the closest to the average date",
# "2.2.1.b. Not anomalous adm age, have a PO birth date, replace birth date with the closest birth date to the average birth date",
# "2.2.2.a. Prioritized birthdate of mortality database", "2.2.2.c.Multiple birth dates. Chose the most frequent. If ties, selected the closest to the average date"
# ), class = c("glue", "character")), n = c(10L, 5L, 4L, 3L, 2L,
# 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), percent = c(26.3,
# 13.2, 10.5, 7.9, 5.3, 5.3, 5.3, 5.3, 2.6, 2.6, 2.6, 2.6, 2.6,
# 2.6, 2.6, 2.6)), row.names = c(NA, -16L), class = c("tabyl",
# "data.frame"))
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
invisible("Ver caso 8f20a93917a8d893793661630272c312df4d429901dfcf4fe859b95f379e70d8, tiene una edad de 22 ademas de una mala")Code
# Build prev1f: keep only entries with an admission age in [13, 90], logging
# entry/ID counts before and after the filter.
# NOTE: print(message(...)) prints NULL because message() returns NULL —
# this is what produces the bare "NULL" lines in the rendered output.
SISTRAT23_c1_2010_2022_df_prev1f<-
SISTRAT23_c1_2010_2022_df_prev1e|>
(\(df) {
print(message(paste0("Discarding infrequent & missing adm ages before rule-based imp, Entries: ", nrow(df))))
print(message(paste0("Discarding infrequent & missing adm ages before rule-based imp, RUNs: ", tidytable::distinct(df, hash_key) |> nrow())))
df
})() |>
tidytable::filter(adm_yr_rec<=90, adm_yr_rec>=13, !is.na(adm_yr_rec)) |>
(\(df) {
print(message(paste0("Discarding infrequent & missing adm ages after rule-based imp, Entries: ", nrow(df))))
print(message(paste0("Discarding infrequent & missing adm ages after rule-based imp, RUNs: ", tidytable::distinct(df, hash_key) |> nrow())))
df
})()Code
# 150133 #150190
# 106227 #106284
#
invisible("Por tanto, sacamos a 21 usuarios")NULL
NULL
NULL
NULL
3. Standardization of Some Variables
In this stage, we implemented most of the recommendations made by SENDA’s professionals:
- Deleted cases categorized as Parole (n = 1)
- Collapse the different plan types (
plan_type) into the following: pg-pab, pg-pai, pg-pr, m-pab, m-pai, and m-pr - Declared the age of onset of drug use as invalid if it exceeded the current age (
age_subs_onset). - Declared the age of drug use for the main substance as invalid if inconsistent (
age_prim_subs_onset). - Changed the name of the variable of admission age from
adm_yr_rec to adm_age_rec - Defined early and late drop-outs within compliance with treatment (
tr_compliance): not completed- early [<90 days] discharge, not completed- late [>=90 days] discharge, early adm. discharge (dismissal due to misconduct), death, completed, referral, currently in. Cases that were classified as death as a cause of discharge were transformed into administrative discharge to ensure consistency with other cases. - Collapsed and standardized substances (
first_sub_used,primary_sub,second_sub1,second_sub2,second_sub3) into the following categories: hallucinogens (e.g., LSD, mushrooms), cocaine, amphetamine-type stimulants (e.g., methamphetamine, ecstasy), inhalants, marijuana, opioids (e.g., heroin, methadone, painkillers), cocaine paste base and crack, sedatives, hypnotics and tranquilizers, and others (e.g., anabolic steroids). Records without secondary substance info (CIP- Temporary Detention Center [Centro de internación provisoria] or CRC- Closed Regime Detention Center[Centro de Régimen Cerrado]), without consumption, or with an unspecified primary substance category were marked as missing values. - Collapsed marital status (
marital_status) into the following categories: married, cohabiting or in shared living arrangements (civil union or de facto), separated or divorced, single, and widowed. Annulled status was coded into divorced (due to the possibility of having children and small size ~.01%), and those who did not answer were marked as missing values. - Collapsed occupational condition (
occupation_condition) into an occupational status composed of three categories: employed, unemployed, and inactive. - Assigned an occupational status (
occupation_status) only to records with occupational conditions marked as employed, leaving others as missing. Finally, standardized these categories into English labels. - Translated labels of Biopsychosocial compromise and ordered according to complexity (
biopsych_comp). - Translated labels of Drug dependence diagnosis (
sub_dep_icd10_status) - Tenure status of households were collapsed into Stays temporarily with a relative, Owner/Transferred dwellings/Pays Dividends, Illegal Settlement, and Others (~3.0%).
- Treatment admission motive (
adm_motive) was collapsed into Spontaneous consultation, Sanitary sector referral, Justice sector referral, SENDA-related (Another SUD center/FONODROGAS/ SENDA Previene), and Other (~4.6%) - Collapsed educational attainment into three categories (
ed_attainment): completed primary school or less, completed or incomplete high school, and more than high school. Categories such as education for students with disabilities were classified in completed primary school. The ‘Don’t know/No response’ categories were classified as invalid. - Declared the category “Unknown” for the route of administration of the main substance as an invalid value.
- Declared the category “Unknown” for the frequency of consumption of the primary substance (
prim_sub_freq) as an invalid value, added to translation of labels and collapsing “did not use” category with less than 1 day a week frequency into “less than 1 day a week”. - Created a variable with English labels for the type of center (
type_center). - The commune/municipality of residence was recoded to include the unique territorial code used before 2018 (
municipallity_res_cutpre18).
Code
# Lookup table mapping the commune-of-residence names used in SISTRAT
# (lowercase, mostly unaccented) to a standardized label with the unique
# territorial code (CUT) used before 2018 appended in parentheses.
# Entries whose value contains " * " appear to be communes of the Nuble
# region (codes 84xx, split from Biobio in 2018) — TODO confirm. Some values
# keep unaccented spellings (e.g. 'nunoa', 'vicuna', 'vina del mar');
# presumably intentional to match downstream joins — verify.
municipality_map <- c('algarrobo'='algarrobo (6502)',
'alhue'='alhué (13502)',
'alto del carmen'='alto del carmen (3302)',
'alto hospicio'='alto hospicio (1107)',
'ancud'='ancud (10202)',
'andacollo'='andacollo (4103)',
'angol'='angol (9201)',
'antofagasta'='antofagasta (2101)',
'antuco'='antuco (8302)',
'arauco'='arauco (8202)',
'arica'='arica (15101)',
'aysen'='aysén (11201)',
'buin'='buin (13402)',
'bulnes'='bulnes * (8402)',
'cabildo'='cabildo (5402)',
'cabrero'='cabrero (8303)',
'calama'='calama (2201)',
'calbuco'='calbuco (10102)',
'caldera'='caldera (3102)',
'calera de tango'='calera de tango (13403)',
'calle larga'='calle larga (5302)',
'camarones'='camarones (15102)',
'camina'='camina (1402)',
'canela'='canela (4202)',
'canete'='canete (8203)',
'carahue'='carahue (9102)',
'cartagena'='cartagena (5603)',
'casablanca'='casablanca (5102)',
'castro'='castro (10201)',
'catemu'='catemu (5702)',
'cauquenes'='cauquenes (7201)',
'cerrillos'='cerrillos (13102)',
'cerro navia'='cerro navia (13103)',
'chanaral'='chanaral (3201)',
'chanco'='chanco (7202)',
'chepica'='chépica (6302)',
'chiguayante'='chiguayante (8103)',
'chile chico'='chile chico (11401)',
'chillan'='chillán * (8401)',
'chillan viejo'='chillán viejo * (8406)',
'chimbarongo'='chimbarongo (6303)',
'cholchol'='cholchol (9121)',
'chonchi'='chonchi (10203)',
'cisnes'='cisnes (11202)',
'cobquecura'='cobquecura * (8403)',
'cochamo'='cochamó (10103)',
'cochrane'='cochrane (11301)',
'codegua'='codegua (6102)',
'coelemu'='coelemu * (8404)',
'coihueco'='coihueco * (8405)',
'coinco'='coinco (6103)',
'colbun'='colbún (7402)',
'colchane'='colchane (1403)',
'colina'='colina (13301)',
'collipulli'='collipulli (9202)',
'coltauco'='coltauco (6104)',
'combarbala'='combarbalá (4302)',
'concepcion'='concepción (8101)',
'conchali'='conchalí (13104)',
'concon'='concón (5103)',
'constitucion'='constitución (7102)',
'contulmo'='contulmo (8204)',
'copiapo'='copiapó (3101)',
'coquimbo'='coquimbo (4102)',
'coronel'='coronel (8102)',
'corral'='corral (14102)',
'coyhaique'='coyhaique (11101)',
'cunco'='cunco (9103)',
'curacautin'='curacautín (9203)',
'curacavi'='curacaví (13503)',
'curaco de velez'='curaco de vélez (10204)',
'curanilahue'='curanilahue (8205)',
'curarrehue'='curarrehue (9104)',
'curepto'='curepto (7103)',
'curico'='curicó (7301)',
'dalcahue'='dalcahue (10205)',
'diego de almagro'='diego de almagro (3202)',
'donihue'='donihue (6105)',
'el bosque'='el bosque (13105)',
'el carmen'='el carmen * (8407)',
'el monte'='el monte (13602)',
'el quisco'='el quisco (5604)',
'el tabo'='el tabo (5605)',
'empedrado'='empedrado (7104)',
'ercilla'='ercilla (9204)',
'estacion central'='estación central (13106)',
'florida'='florida (8104)',
'freire'='freire (9105)',
'freirina'='freirina (3303)',
'fresia'='fresia (10104)',
'frutillar'='frutillar (10105)',
'futrono'='futrono (14202)',
'gorbea'='gorbea (9107)',
'graneros'='graneros (6106)',
'guaitecas'='guaitecas (11203)',
'hijuelas'='hijuelas (5503)',
'hualaihue'='hualaihué (10403)',
'hualane'='hualané (7302)',
'hualpen'='hualpén (8112)',
'hualqui'='hualqui (8105)',
'huara'='huara (1404)',
'huasco'='huasco (3304)',
'huechuraba'='huechuraba (13107)',
'illapel'='illapel (4201)',
'independencia'='independencia (13108)',
'iquique'='iquique (1101)',
'isla de maipo'='isla de maipo (13603)',
'isla de pascua'='isla de pascua (5201)',
'juan fernandez'='juan fernández (5104)',
'la calera'='la calera (5502)',
'la cisterna'='la cisterna (13109)',
'la cruz'='la cruz (5504)',
'la estrella'='la estrella (6202)',
'la florida'='la florida (13110)',
'la granja'='la granja (13111)',
'la higuera'='la higuera (4104)',
'la ligua'='la ligua (5401)',
'la pintana'='la pintana (13112)',
'la reina'='la reina (13113)',
'la serena'='la serena (4101)',
'la union'='la unión (14201)',
'lago ranco'='lago ranco (14203)',
'lago verde'='lago verde (11102)',
'laja'='laja (8304)',
'lampa'='lampa (13302)',
'lanco'='lanco (14103)',
'las cabras'='las cabras (6107)',
'las condes'='las condes (13114)',
'lautaro'='lautaro (9108)',
'lebu'='lebu (8201)',
'licanten'='licantén (7303)',
'limache'='limache (5802)',
'linares'='linares (7401)',
'litueche'='litueche (6203)',
'llanquihue'='llanquihue (10107)',
'llay llay'='llaillay (5703)',
'lo barnechea'='lo barnechea (13115)',
'lo espejo'='lo espejo (13116)',
'lo prado'='lo prado (13117)',
'lolol'='lolol (6304)',
'loncoche'='loncoche (9109)',
'longavi'='longaví (7403)',
'lonquimay'='lonquimay (9205)',
'los alamos'='los álamos (8206)',
'los andes'='los andes (5301)',
'los angeles'='los ángeles (8301)',
'los lagos'='los lagos (14104)',
'los muermos'='los muermos (10106)',
'los sauces'='los sauces (9206)',
'los vilos'='los vilos (4203)',
'lota'='lota (8106)',
'lumaco'='lumaco (9207)',
'machali'='machalí (6108)',
'macul'='macul (13118)',
'mafil'='máfil (14105)',
'maipu'='maipú (13119)',
'malloa'='malloa (6109)',
'marchigue'='marchihue (6204)',
'maria elena'='maría elena (2302)',
'maria pinto'='maría pinto (13504)',
'mariquina'='mariquina (14106)',
'maule'='maule (7105)',
'maullin'='maullín (10108)',
'mejillones'='mejillones (2102)',
'melipeuco'='melipeuco (9110)',
'melipilla'='melipilla (13501)',
'molina'='molina (7304)',
'monte patria'='monte patria (4303)',
'mulchen'='mulchén (8305)',
'nacimiento'='nacimiento (8306)',
'nancagua'='nancagua (6305)',
# 'navarino' is mapped to the commune's current name, cabo de hornos
'navarino'='cabo de hornos (12201)',
'navidad'='navidad (6205)',
'negrete'='negrete (8307)',
'ninhue'='ninhue * (8408)',
'niquen'='niquén * (8409)',
'nogales'='nogales (5506)',
'nueva imperial'='nueva imperial (9111)',
'nunoa'='nunoa (13120)',
"o´higgins"="o'higgins (11302)",
'olivar'='olivar (6111)',
'ollagüe'='ollagüe (2202)',
'olmue'='olmué (5803)',
'osorno'='osorno (10301)',
'ovalle'='ovalle (4301)',
'padre hurtado'='padre hurtado (13604)',
'padre las casas'='padre las casas (9112)',
'paihuano'='paiguano (4105)',
'paillaco'='paillaco (14107)',
'paine'='paine (13404)',
'palena'='palena (10404)',
'palmilla'='palmilla (6306)',
'panguipulli'='panguipulli (14108)',
'panquehue'='panquehue (5704)',
'papudo'='papudo (5403)',
'paredones'='paredones (6206)',
'parral'='parral (7404)',
'pedro aguirre cerda'='pedro aguirre cerda (13121)',
'pelarco'='pelarco (7106)',
'pelluhue'='pelluhue (7203)',
'pemuco'='pemuco * (8410)',
'penaflor'='penaflor (13605)',
'penalolen'='penalolén (13122)',
'pencahue'='pencahue (7107)',
'penco'='penco (8107)',
'peralillo'='peralillo (6307)',
'perquenco'='perquenco (9113)',
'petorca'='petorca (5404)',
'peumo'='peumo (6112)',
'pica'='pica (1405)',
'pichidegua'='pichidegua (6113)',
'pichilemu'='pichilemu (6201)',
'pinto'='pinto * (8411)',
'pirque'='pirque (13202)',
'pitrufquen'='pitrufquén (9114)',
'placilla'='placilla (6308)',
'portezuelo'='portezuelo * (8412)',
'porvenir'='porvenir (12301)',
'pozo almonte'='pozo almonte (1401)',
'primavera'='primavera (12302)',
'providencia'='providencia (13123)',
'puchuncavi'='puchuncaví (5105)',
'pucon'='pucón (9115)',
'pudahuel'='pudahuel (13124)',
'puente alto'='puente alto (13201)',
'puerto montt'='puerto montt (10101)',
'puerto natales'='puerto natales (12401)',
'puerto octay'='puerto octay (10302)',
'puerto saavedra'='puerto saavedra (9116)',
'puerto varas'='puerto varas (10109)',
'pumanque'='pumanque (6309)',
'punitaqui'='punitaqui (4304)',
'punta arenas'='punta arenas (12101)',
'puqueldon'='puqueldón (10206)',
'puren'='purén (9208)',
'purranque'='purranque (10303)',
'putaendo'='putaendo (5705)',
'puyehue'='puyehue (10304)',
'queilen'='queilén (10207)',
'quellon'='quellón (10208)',
'quemchi'='quemchi (10209)',
'quilaco'='quilaco (8308)',
'quilicura'='quilicura (13125)',
'quilleco'='quilleco (8309)',
'quillon'='quillón * (8413)',
'quillota'='quillota (5501)',
'quilpue'='quilpué (5801)',
'quinchao'='quinchao (10210)',
'quinta de tilcoco'='quinta de tilcoco (6114)',
'quinta normal'='quinta normal (13126)',
'quintero'='quintero (5107)',
'quirihue'='quirihue * (8414)',
'rancagua'='rancagua (6101)',
'ranquil'='ránquil * (8415)',
'rauco'='rauco (7305)',
'recoleta'='recoleta (13127)',
'renaico'='renaico (9209)',
'renca'='renca (13128)',
'rengo'='rengo (6115)',
'requinoa'='requínoa (6116)',
'retiro'='retiro (7405)',
'rinconada'='rinconada (5303)',
'rio bueno'='río bueno (14204)',
'rio claro'='río claro (7108)',
'rio hurtado'='río hurtado (4305)',
'rio negro'='río negro (10305)',
'romeral'='romeral (7306)',
'sagrada familia'='sagrada familia (7307)',
'salamanca'='salamanca (4204)',
'san antonio'='san antonio (5601)',
'san bernardo'='san bernardo (13401)',
'san carlos'='san carlos * (8416)',
'san clemente'='san clemente (7109)',
'san esteban'='san esteban (5304)',
'san felipe'='san felipe (5701)',
'san fernando'='san fernando (6301)',
'san francisco de mostazal'='san francisco de mostazal (6110)',
# NOTE(review): 12104 is san gregorio in Magallanes; the key
# "san gregorio de niquen" suggests niquén (8409) — verify this mapping.
'san gregorio de niquen'='san gregorio (12104)',
'san ignacio'='san ignacio * (8418)',
'san javier'='san javier (7406)',
'san joaquin'='san joaquín (13129)',
'san jose de maipo'='san josé de maipo (13203)',
'san juan de la costa'='san juan de la costa (10306)',
'san miguel'='san miguel (13130)',
'san nicolas'='san nicolas (8419)',
'san pablo'='san pablo (10307)',
'san pedro'='san pedro (13505)',
'san pedro de atacama'='san pedro de atacama (2203)',
'san pedro de la paz'='san pedro de la paz (8108)',
'san rafael'='san rafael (7110)',
'san ramon'='san ramón (13131)',
'san rosendo'='san rosendo (8310)',
'san vicente'='san vicente (6117)',
'santa barbara'='santa bárbara (8311)',
'santa cruz'='santa cruz (6310)',
'santa juana'='santa juana (8109)',
'santa maria'='santa maría (5706)',
# The three historical Santiago sub-areas collapse to one commune code
'santiago centro'='santiago (13101)',
'santiago oeste'='santiago (13101)',
'santiago sur'='santiago (13101)',
'santo domingo'='santo domingo (5606)',
'sierra gorda'='sierra gorda (2103)',
'talagante'='talagante (13601)',
'talca'='talca (7101)',
'talcahuano'='talcahuano (8110)',
'taltal'='taltal (2104)',
'temuco'='temuco (9101)',
'teno'='teno (7308)',
'teodoro schmidt'='teodoro schmidt (9117)',
'tierra amarilla'='tierra amarilla (3103)',
'til-til'='tiltil (13303)',
'tirua'='tirúa (8207)',
'tocopilla'='tocopilla (2301)',
'tolten'='toltén (9118)',
'tome'='tomé (8111)',
'torres del paine'='torres del paine (12402)',
'tortel'='tortel (11303)',
'traiguen'='traiguén (9210)',
'trehuaco'='treguaco * (8420)',
'tucapel'='tucapel (8312)',
'valdivia'='valdivia (14101)',
'vallenar'='vallenar (3301)',
'valparaiso'='valparaíso (5101)',
'vichuquen'='vichuquén (7309)',
'victoria'='victoria (9211)',
'vicuna'='vicuna (4106)',
'vilcun'='vilcún (9119)',
'villa alegre'='villa alegre (7407)',
'villa alemana'='villa alemana (5804)',
'villarrica'='villarrica (9120)',
'vina del mar'='vina del mar (5109)',
'vitacura'='vitacura (13132)',
'yerbas buenas'='yerbas buenas (7408)',
'yumbel'='yumbel (8313)',
'yungay'='yungay * (8421)',
'zapallar'='zapallar (5405)')
SISTRAT23_c1_2010_2022_df_prev1g<-
SISTRAT23_c1_2010_2022_df_prev1f|>
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#Deletion of Paroles and missing values
tidytable::filter(tipo_de_plan != "pai pv") |>
tidytable::filter(tipo_de_plan != "pai lv") |>
(\(df) {
print(message(paste0("Discarding parole & missing plans, Entries: ", nrow(df))))
print(message(paste0("Discarding parole & missing plans, RUNs: ", tidytable::distinct(df, hash_key) |> nrow())))
df
})()|>
dplyr::mutate(pub_center=factor(if_else(as.character(tipo_centro)=="publico",TRUE,FALSE,NA)))|>
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#Collapsed treatment plans
tidytable::mutate(OBS= tidytable::case_when(tipo_de_plan %in% c("m pai (p)","m-pr2","pg pai 2", "pg pai 2", "otro", "calle") ~paste0(as.character(OBS),";","3.1. Collapsed Treatment Plans"), TRUE ~ as.character(OBS)))|>
tidytable::mutate(plan_type = tidytable::recode(tipo_de_plan,
"m pai (p)" = "m-pai",
"m-pai2" = "m-pai",
"m-pr2" = "m-pr",
"pg pai 2" = "pg-pai",
"otro" = "pg-pr",
"calle" = "pg-pr")) |>
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#invalid age of substance use onset (vs. admission age)
tidytable::rename("adm_age_rec"="adm_yr_rec")|>
tidytable::mutate(age_subs_onset= ifelse(edad_inicio_consumo<= adm_age_rec, edad_inicio_consumo, NA))|>
tidytable::mutate(OBS=case_when(edad_inicio_consumo>adm_age_rec ~ paste0(as.character(OBS),";","3.2. Invalid Age Of Onset of Substance use, Higher than admission age"), TRUE ~ as.character(OBS)))|>
tidytable::mutate(age_prim_subs_onset= ifelse(edad_inicio_sustancia_principal<= adm_age_rec, edad_inicio_sustancia_principal, NA))|>
tidytable::mutate(OBS=case_when(edad_inicio_sustancia_principal>adm_age_rec ~ paste0(as.character(OBS),";","3.3. Invalid Age Of Onset of Primary Substance use, Higher than admission age"), TRUE ~ as.character(OBS)))|>
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#invalid age of substance use onset (< 5 yrs old)
tidytable::mutate(OBS=case_when(age_subs_onset< 5 & age_subs_onset>=0 ~ paste0(as.character(OBS),";","3.4. Invalid Age Of Onset of Substance use, <5 yrs old"), TRUE ~ as.character(OBS)))|>
tidytable::mutate(OBS=case_when(age_prim_subs_onset< 5 & age_prim_subs_onset>=0 ~ paste0(as.character(OBS),";","3.5. Invalid Age Of Onset of Primary Substance use, <5 yrs old"), TRUE ~ as.character(OBS)))|>
tidytable::mutate(age_subs_onset= ifelse(age_subs_onset< 5 & age_subs_onset>=0, NA, age_subs_onset))|>
tidytable::mutate(age_prim_subs_onset= ifelse(age_prim_subs_onset< 5 & age_prim_subs_onset>=0, NA, age_prim_subs_onset))|>
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#Negative tr. days
tidytable::mutate(OBS=case_when(dit_rec<0 ~ paste0(as.character(OBS),";","3.6. Negative Treatment Days, Changed Treat Days"),T ~ as.character(OBS)))|>
tidytable::mutate(dit_rec= ifelse(dit_rec<0, NA, dit_rec))|>
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#Early vs. late dropout
tidytable::mutate(dit_earl_drop= ifelse(dit_rec>=90 & !is.na(dit_rec),0,1))|>
tidytable::mutate(dit_earl_drop= factor(dit_earl_drop, labels=c(">= 90 days","<90 days")))|>
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#Treatment compliance
tidytable::mutate(motivo_de_egreso= ifelse(motivo_de_egreso=="muerte","alta adminsitrativa", motivo_de_egreso))|>
tidytable::mutate(tr_compliance= case_when(grepl("<",dit_earl_drop) & motivo_de_egreso=="abandono"~ "early dropout", grepl(">",dit_earl_drop) & motivo_de_egreso=="abandono"~ "late dropout", grepl("<",dit_earl_drop) & grepl("adm", motivo_de_egreso)~ "early adm discharge", grepl(">",dit_earl_drop) & grepl("adm", motivo_de_egreso)~ "late adm discharge", motivo_de_egreso=="alta terapeutica"~ "completion", motivo_de_egreso=="muerte"~ "death", motivo_de_egreso=="derivacion"~ "referral", is.na(motivo_de_egreso)~ "currently in", TRUE~ "other"))|>
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#Substances (first used, primary and secondary), coding
tidytable::mutate(first_sub_used= sustancia_de_inicio, primary_sub= sustancia_principal, second_sub1= otras_sustancias_no1, second_sub2= otras_sustancias_no2, second_sub3= otras_sustancias_no3)|>
tidytable::mutate(OBS= case_when(grepl("especif|cip-crc|sin consumo", first_sub_used)|grepl("especif|cip-crc|sin consumo", primary_sub)|grepl("especif|cip-crc|sin consumo", second_sub1)| grepl("especif|cip-crc|sin consumo", second_sub2)|grepl("especif|cip-crc|sin consumo", second_sub3) ~ paste0(OBS,";","3.7. Secondary substances, invalid due to no consumption /unspecified"), TRUE ~ OBS))|>
tidytable::mutate(across(c("first_sub_used", "primary_sub", "second_sub1", "second_sub2", "second_sub3"), ~ tidytable::case_when(grepl("coca",.)~"cocaine powder", grepl("crack|pasta",.)~"cocaine paste", grepl("marihuana",.)~"marijuana", grepl("anfeta|extasis|fenil|estimul",.)~"amphetamine-type stimulants", grepl("alucin|lsd|hongos",.)~"hallucinogens", grepl("opi|hero|metadona",.)~"opioids", grepl("sedante|hipnotico|tranquiliz",.)~"tranquilizers/hypnotics", grepl("inhalable",.)~"inhalants", grepl("esteroid|otros",.)~"others", grepl("especif|cip-crc|sin consumo",.)~ NA_character_, TRUE~.)))|>
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#Marital status
tidytable::mutate(marital_status= tidytable::case_when(grepl("casado|conviviente",estado_conyugal)~"married/cohabiting", grepl("separado|divorciado|anulado", estado_conyugal)~"separated/divorced/annulled", estado_conyugal=="soltero"~"single", estado_conyugal=="viudo"~"widowed", TRUE~NA_character_))|>
tidytable::mutate(OBS=case_when(grepl("contesta",estado_conyugal)~ paste0(OBS,";","3.8. Marital State, Invalid due to No Response"), TRUE ~ OBS))|>
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#occupational condition= employed, unemployed, inactive
tidytable::mutate(occupation_condition= case_when(grepl("trabajando", condicion_ocupacional)~"employed", grepl("desempleado", condicion_ocupacional)~"unemployed", grepl("buscando|cesante", condicion_ocupacional)~"unemployed", TRUE~ "inactive"))|>
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#Occupation status= collapsing and translating
tidytable::mutate(occupation_status= case_when(occupation_condition!="employed"~ NA_character_, T~categoria_ocupacional))|>
tidytable::mutate(occupation_status= case_when(occupation_status=="asalariado"~ "Salaried", occupation_status=="cuenta propia"~ "Self-employed", grepl("volunt",occupation_status)~ "Volunteer worker", grepl("familiar",occupation_status)~ "Unpaid family labour", grepl("otros",occupation_status)~ "Other", T~ occupation_status))|>
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#Biopsychosocial compromise: translating and ordering
tidytable::mutate(biopsych_comp= case_when(compromiso_biopsicosocial=="leve"~'1-Mild', compromiso_biopsicosocial=="moderado"~ '2-Moderate', compromiso_biopsicosocial=="severo"~ '3-Severe', TRUE~ NA_character_))|>
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#Substance dependency diagnoses : translating
tidytable::mutate(sub_dep_icd10_status= dplyr::case_when(grepl("perj",diagnostico_trs_consumo_sustancia)~"Hazardous consumption",grepl("dep",diagnostico_trs_consumo_sustancia)~"Drug dependence",TRUE~NA_character_))|>
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#Tenure status of household: collapsing and translating
tidytable::mutate(tenure_status_household= case_when(tenencia_de_la_vivienda=="allegado"~"Stays temporarily with a relative", tenencia_de_la_vivienda=="arrienda"~"Renting", grepl("cedida|dividendo|propia", tenencia_de_la_vivienda)~ "Owner/Transferred dwellings/Pays Dividends", tenencia_de_la_vivienda=="Ocupación Irregular"~"Illegal Settlement", tenencia_de_la_vivienda=="otros"~"Others", T~NA_character_))|>
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
##Treatment Admission Motive: collapsing and translating
tidytable::mutate(adm_motive= case_when(origen_de_ingreso=="consulta espontanea"~"Spontaneous consultation", grepl("aps|red de salud", origen_de_ingreso)~"Sanitary sector referral", grepl("juzgado|fiscalia|vigilada", origen_de_ingreso)~"Justice sector referral", grepl("educacional|trabajo|servicios sociales|otros$", origen_de_ingreso)~"Other", grepl("educacional|trabajo|otro centro|fonodrogas|previene$", origen_de_ingreso)~"Another SUD center/FONODROGAS/SENDA Previene", T~NA_character_))|>
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#Educational attainment: translated labels, collapsed into three categories and labelled as invalid "non responses" and ordered according to marginalization due to deprivation
#Labels: '1-More than high school','2-Completed high school or less','3-Completed primary school or less'
tidytable::mutate(ed_attainment= case_when(grepl("basica|basica|sin estudios|primaria|kinder|sala cuna|jardin|nunca|especial", escolaridad_ultimo_ano_cursado)~ "3-Completed primary school or less", grepl("media|cientifico|humanidades", escolaridad_ultimo_ano_cursado)~ "2-Completed high school or less", grepl("profesional|tecnica|tecnico|universitaria|doctorado|magister", escolaridad_ultimo_ano_cursado)~"1-More than high school"))|>
tidytable::mutate(OBS= case_when(grepl("no sabe", escolaridad_ultimo_ano_cursado)~paste0(OBS,";","3.9. Educational attainment, invalid due to non-response"), TRUE ~ OBS))|>
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#Primary Substance at Admission Usage Frequency
tidytable::mutate(OBS=case_when(frecuencia_de_consumo_sustancia_principal=="desconocido" ~ paste0(as.character(OBS),";","3.10. Unknown report of primary substance use frequency"), TRUE ~ as.character(OBS)))|>
tidytable::mutate(prim_sub_freq= case_when(grepl("menos de|no consumio", frecuencia_de_consumo_sustancia_principal)~ "1. Less than 1 day a week", grepl("1 dias - semana", frecuencia_de_consumo_sustancia_principal)~"2. 1 day a week", grepl("2-3 dias", frecuencia_de_consumo_sustancia_principal)~ "3. 2 to 3 days a week", grepl("4-6 dias", frecuencia_de_consumo_sustancia_principal)~ "4. 4 to 6 days a week", grepl("todos", frecuencia_de_consumo_sustancia_principal)~ "5. Daily", frecuencia_de_consumo_sustancia_principal=="desconocido"~ NA_character_, T~ NA_character_))|>
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#Type of center
tidytable::mutate(type_center= case_when(tipo_centro=="publico"~"public", tipo_centro=="privado"~"private", T~NA_character_))|>
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#Primary Substance at Admission Usage Frequency
tidytable::mutate(prim_sub_route= dplyr::case_when(grepl("intranasal",via_administracion_sustancia_principal)~"Intranasal (powder aspiration)", grepl("fumada",via_administracion_sustancia_principal)~"Smoked or pulmonary aspiration", grepl("inyectada",via_administracion_sustancia_principal)~"Injected intravenously or intramuscularly", grepl("oral",via_administracion_sustancia_principal)~"Oral (drunk or eaten)", via_administracion_sustancia_principal=="otros"~ "Others", T~ NA_character_))|>
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#: #Regional Location of Center: added territorial code previous to 2018 given the majority of data is before 2018
tidytable::mutate(municipallity_res_cutpre18= tidytable::recode(comuna_residencia, !!!municipality_map))Code
SISTRAT23_c1_2010_2022_df_prev1g$OBS <- sub("^;\\s*", "", SISTRAT23_c1_2010_2022_df_prev1g$OBS)NULL
NULL
4. Exploratory Probabilistic Deduplication
One of the main objectives of this stage of the project is to identify and separate each treatment for each user in a given time as a unique entity. This is shown in diagram of data preparation. The standardization of the age let us reduce a great amount of comparisons between every pair of records, making matching more feasible in terms of computational resources. For example, if we decided to compare each pair under consideration, we would have the number of cases (n= 150,187) multiplied by itself, leading to a total of 22,556,134,969 comparisons. To reach our objective, it was necessary to explore the principal causes that explain why or how a case matched with another. From duplicated cases, we knew how many records shared the same HASH and date of admission. But we needed to explore whether there would be other possible rules that would help to identify distinct treatments.
4.1 Perfect Duplicates of HASH and Date of Admission
Code
#create the duplicated dataset, following the recommendation to separate columns
# Flag perfect duplicates on the pair (HASH key, recoded admission date):
# duplicated() marks every occurrence after the first one.
duplicated_rows_concat <- data.frame(duplicated_HASH_date = duplicated(SISTRAT23_c1_2010_2022_df_prev1g[,c("hash_key","adm_date_rec")]),
row_dup_HASH_date = 1:nrow(SISTRAT23_c1_2010_2022_df_prev1g[,c("hash_key","adm_date_rec")])) #%>%
# Count rows flagged as duplicated (expected to be 0 at this stage)
duplicated_rows_concat |> dplyr::filter(duplicated_HASH_date==TRUE) |> nrow()
# dup_hash_date = size of each (hash_key, admission-date) group; then tabulate
# how often each group size occurs and render it as a markdown table.
data.table::as.data.table(SISTRAT23_c1_2010_2022_df_prev1g)[, dup_hash_date := .N, by = c("hash_key","adm_date_rec_num")] |> ##dim() # yields 117,190 unique cases. BUT BEWARE: among the groups of size 2, 3, 4, 5 and more there are also unique cases (which is why a distinct() does not capture them)
tidytable::group_by(dup_hash_date) |>
tidytable::summarise(n=n()) |>
# share of rows per group size, formatted as a percentage string
tidytable::mutate(perc = round(n / sum(n),2)*100) |>
tidytable::mutate(perc = paste0(perc,"%")) |>
# number of distinct (hash, date) combinations represented by each group size
tidytable::mutate(Tot.Cases = n/dup_hash_date)|>
knitr::kable(format = "markdown", format.args = list(decimal.mark = ".", big.mark = ","),
caption="Times that the combination of HASH-Key & Date of Admission may appear in the dataset",
align =rep('c', 4))[1] 0
| dup_hash_date | n | perc | Tot.Cases |
|---|---|---|---|
| 1 | 150,187 | 100% | 150,187 |
In Table 13, we can see that no combination of HASH-Key and date of admission occurred more than once; that is, there were no remaining duplicates on these two fields. As done in Section 1.c, we resolved issues of this sort.
At this stage of the research, we needed to detect more complex patterns, in terms of cases with similar HASH-Key and date of admission. This approach is merely exploratory and aims to find cases with imperfect agreements on one or more of the variables. Once some variables are standardized, we would be able to use this approach to detect and replace values and erase duplicated cases. We ran data into a package in the software Stata called dtalink, with the following criteria:
- Hash Key: if matched, add 25 points; if not, subtract 25 points
- SENDA ID: if matched, add 25 points; if not, subtract 25 points
- Sex: if matched, add 10 points
- Center ID: if matched, add 10 points
- Date of Admission: if matched, add 30 points; if not, subtract 30 points. Also, we added a caliper of 5 days to still be considered as a match with a difference of 5 days or less.
- We added a Blocking variable of Birth year to reduce the time of computation and match each case within people with similar birth years (5 year-blocks).
- We consider a significant match if it accumulates at least 70 points.
Code
# Blocking variable for probabilistic matching: 5-year birth cohorts
# (birth year floored to the nearest multiple of 5) restrict the set of
# candidate pairs and keep the comparison space tractable.
# Fix: refer to the column via mutate()'s data masking instead of
# re-indexing the global data frame with `$`, which is fragile if the
# pipeline is ever filtered or grouped upstream.
SISTRAT23_c1_2010_2022_df_prev1g <- SISTRAT23_c1_2010_2022_df_prev1g %>%
  mutate(yr_block = floor(year(birth_date_rec) / 5) * 5)
# Perform matching with SQL query
# Scored pairwise comparison (dtalink-style weights): +/-25 points for HASH
# key and SENDA ID agreement, +10 for sex and for center agreement, +/-30
# for admission date. Candidate pairs are blocked on the 5-year birth cohort
# and must have admission dates within 15 days of each other.
# Fix: the join condition previously compared the SELECT-list aliases
# (a_id_match < b_id_match); referencing result aliases inside ON is a
# SQLite-only extension. The base columns a.rn / b.rn carry the same values
# and are portable.
matches_scored <- sqldf::sqldf('
SELECT
a.rn AS id_a,
b.rn AS id_b,
(CASE WHEN a.hash_key = b.hash_key THEN 25 ELSE -25 END) +
(CASE WHEN a.codigo_identificacion = b.codigo_identificacion THEN 25 ELSE -25 END) +
(CASE WHEN a.sexo = b.sexo THEN 10 ELSE 0 END) +
(CASE WHEN a.id_centro = b.id_centro THEN 10 ELSE 0 END) +
(CASE WHEN a.adm_date_rec = b.adm_date_rec THEN 30 ELSE -30 END) AS score,
ABS(JULIANDAY(a.adm_date_rec) - JULIANDAY(b.adm_date_rec)) AS date_diff,
-- Alias repeated columns to avoid ambiguity:
a.hash_key AS a_hash_key,
b.hash_key AS b_hash_key,
a.codigo_identificacion AS a_codigo_ident,
b.codigo_identificacion AS b_codigo_ident,
a.sexo AS a_sexo,
b.sexo AS b_sexo,
a.id_centro AS a_id_centro,
b.id_centro AS b_id_centro,
a.adm_date_rec AS a_adm_date_rec,
b.adm_date_rec AS b_adm_date_rec,
a.rn AS a_id_match,
b.rn AS b_id_match
FROM SISTRAT23_c1_2010_2022_df_prev1g a
JOIN SISTRAT23_c1_2010_2022_df_prev1g b
ON a.rn < b.rn -- base columns, not SELECT aliases; also drops mirrored pairs
AND a.yr_block = b.yr_block
AND ABS(julianday(a.adm_date_rec) - julianday(b.adm_date_rec)) <= 15
WHERE a.adm_date_rec IS NOT NULL
AND b.adm_date_rec IS NOT NULL
')
# Perform matching with SQL query
matches_noscore <- sqldf::sqldf('
SELECT
a.rn AS id_a,
b.rn AS id_b,
-- Alias repeated columns to avoid ambiguity:
a.hash_key AS a_hash_key,
b.hash_key AS b_hash_key,
a.codigo_identificacion AS a_codigo_ident,
b.codigo_identificacion AS b_codigo_ident,
a.sexo AS a_sexo,
b.sexo AS b_sexo,
a.id_centro AS a_id_centro,
b.id_centro AS b_id_centro,
a.adm_date_rec AS a_adm_date_rec,
b.adm_date_rec AS b_adm_date_rec,
a.rn AS a_id_match,
b.rn AS b_id_match
FROM SISTRAT23_c1_2010_2022_df_prev1g a
JOIN SISTRAT23_c1_2010_2022_df_prev1g b
ON a_id_match < b_id_match
AND a.hash_key = b.hash_key
AND ABS(julianday(a.adm_date_rec) - julianday(b.adm_date_rec)) <= 15
WHERE a.adm_date_rec IS NOT NULL
AND b.adm_date_rec IS NOT NULL
')
#409
#
#id_match < b.id_match= ignores duplicated matches
# Keep pairs meeting the score threshold.
# NOTE(review): the narrative describes a 70-point significance cutoff, but
# the code keeps pairs with score >= 10 (an exploratory, looser threshold;
# the text below reports 346 such pairs) — confirm the intended cutoff.
matches_scored_filtered <- matches_scored |>
filter(score >= 10)
#74
# Remove the full pairwise table to free memory; only the filtered pairs
# are needed from here on.
rm(matches_scored)
#3490087
For scored matches, only 346 pairs scored at least 10 points. We found 409 pairs with the same HASH Key whose admission dates differed by 15 days or less in absolute value.
Code
# Distribution of how often each HASH appears among scored matches
# (>= 10 points). Each pair contributes both of its members, hence the
# rbind with the b-side hash copied into a_hash_key before counting.
# Fix: message() writes to stderr and returns NULL invisibly; the former
# print(message(...)) wrappers only echoed a spurious "NULL" after each
# line, so the print() calls were removed.
matches_scored_filtered |>
rbind.data.frame(mutate(matches_scored_filtered, a_hash_key= b_hash_key)) |>
group_by(a_hash_key) |>
summarise(n= n()) |>
(\(df) {
message(paste0("Total: ", length(df$n)/2))
message(paste0("1st quartile: ", quantile(df$n,.25)))
message(paste0("2nd quartile: ", quantile(df$n,.5)))
message(paste0("3rd quartile: ", quantile(df$n,.75)))
message(paste0("Percentile 97.5: ", round(quantile(df$n,.975),2)))
df
})() |>
pull(n) |>
hist(main= "Histogram of Scored Matches (>10 points)", xlab="Times present", breaks =20)Code
# Same per-HASH frequency summary for the deterministic (non-scored)
# matches: count both members of each pair, then summarise and plot.
# Fix: print(message(...)) printed the invisible NULL returned by
# message(); the print() wrappers were removed.
matches_noscore|>
rbind.data.frame(mutate(matches_noscore, a_hash_key= b_hash_key)) |>
group_by(a_hash_key) |>
summarise(n= n()) |>
(\(df) {
message(paste0("Total: ", length(df$n)/2))
message(paste0("1st quartile: ", quantile(df$n,.25)))
message(paste0("2nd quartile: ", quantile(df$n,.5)))
message(paste0("3rd quartile: ", quantile(df$n,.75)))
message(paste0("Percentile 97.5: ", round(quantile(df$n,.975),2)))
df
})() |>
pull(n) |>
hist(main= "Histogram of non-scored matches of RUN & |10-day| difference in admission dates", xlab="Times present", breaks =20)NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
Code
# Display 10 seeded random matched pairs side by side so the reader can
# inspect what the probabilistic matches look like.
SISTRAT23_c1_2010_2022_df_prev1g |>
# select interleaved cases: both members (columns 1 and 2) of 10 pairs
# sampled with a fixed seed from the non-scored matches
tidytable::filter(rn %in% c(rbind(dplyr::pull(sample_n_with_seed(data.frame(matches_noscore),10, seed=2125),1), dplyr::pull(sample_n_with_seed(data.frame(matches_noscore),10, seed=2125),2)))) |>
select(TABLE, hash_key, id_centro, tipo_de_plan, senda, adm_date_rec , dit_rec, disch_date, tr_compliance, motivo_de_egreso_alta_administrativa) |>
# mask the HASH by recoding it to a small sequential integer label
dplyr::mutate(hash_key=as.numeric(factor(hash_key))) |>
knitr::kable(format = "html", format.args = list(decimal.mark = ".", big.mark = ","),
col.names= c("Date of retrieval", "HASH", "Center ID", "Plan type", "SENDA", "Admission date", "Days in treatment", "Discharge date", "Cause of discharge", "Motive of adm.discharge"),
caption="Table 15. Example of probabilistic matches (example with 10 HASHs)", align =rep('c', 101)) |>
kableExtra::scroll_box(width = "100%", height = "350px")
#matches_scored_filtered matches_noscore| Date of retrieval | HASH | Center ID | Plan type | SENDA | Admission date | Days in treatment | Discharge date | Cause of discharge | Motive of adm.discharge |
|---|---|---|---|---|---|---|---|---|---|
| 2019 | 1 | 163 | pg-pr | si | 2019-05-06 | 1 | 2019-05-07 | completion | |
| 2019 | 1 | 163 | pg-pr | si | 2019-05-08 | 62 | 2019-07-09 | early adm discharge | incumplimiento grave a las normas de convivencia del programa |
| 2012 | 2 | 295 | pg-pab | no | 2012-03-01 | 0 | 2012-03-01 | referral | |
| 2012 | 2 | 295 | pg-pab | si | 2012-03-02 | 28 | 2012-03-30 | early dropout | |
| 2019 | 3 | 221 | pg-pai | si | 2019-12-19 | 14 | 2020-01-02 | referral | |
| 2020 | 3 | 591 | pg-pai | si | 2020-01-03 | 301 | 2020-10-30 | completion | |
| 2019 | 4 | 258 | m-pr | si | 2019-05-10 | 3 | 2019-05-13 | referral | |
| 2019 | 4 | 262 | m-pai | si | 2019-05-22 | 90 | 2019-08-20 | late dropout | |
| 2014 | 5 | 165 | m-pai | si | 2014-09-25 | 5 | 2014-09-30 | referral | |
| 2015 | 5 | 165 | m-pr | si | 2014-10-08 | 107 | 2015-01-23 | late dropout | |
| 2011 | 6 | 181 | pg-pab | si | 2011-04-18 | 14 | 2011-05-02 | referral | |
| 2011 | 6 | 181 | pg-pai | si | 2011-05-03 | 121 | 2011-09-01 | referral | |
| 2015 | 7 | 194 | pg-pai | si | 2015-02-13 | 14 | 2015-02-27 | referral | |
| 2016 | 7 | 183 | pg-pr | si | 2015-02-27 | 398 | 2016-03-31 | completion | |
| 2010 | 8 | 291 | pg-pab | si | 2010-05-10 | 8 | 2010-05-18 | referral | |
| 2011 | 8 | 303 | pg-pr | si | 2010-05-19 | 286 | 2011-03-01 | completion | |
| 2013 | 9 | 209 | pg-pab | si | 2013-01-09 | 12 | 2013-01-21 | referral | |
| 2014 | 9 | 215 | pg-pr | si | 2013-01-18 | 373 | 2014-01-26 | completion | |
| 2018 | 10 | 469 | pg-pab | si | 2018-03-02 | 59 | 2018-04-30 | early dropout | |
| 2018 | 10 | 137 | pg-pai | si | 2018-03-13 | 171 | 2018-08-31 | referral |
In the example, we can notice that some cases shared center IDs, some did not have strictly overlapping dates, and some had different reasons for discharge, suggesting continuity across treatment episodes. This approach is useful to identify cases that may have been duplicated due to data-entry errors. We will use this approach to identify and correct duplicated cases in the next stage of the project.
4.2 Overlappings
An analysis of duplicated events showed that many ranges between the dates of admission and discharge were overlapping due to referrals to other centers, principally by changes in the treatment center. However, to identify overlappings in treatments, it is necessary to obtain from other sources the missing dates as much as possible and clean the dates that may be incorrectly formatted.
Code
# Build an interval table for overlap detection: one row per treatment
# episode with admission/discharge dates (plus their numeric versions),
# stored as a data.table for the SQL self-join below.
CONS_C1_df_dup_intervals <- SISTRAT23_c1_2010_2022_df_prev1g |>
rename(hash_key_2 = hash_key, row2 = rn) |>
select(
row2, hash_key_2, codigo_identificacion, TABLE_rec, adm_date_rec, disch_date, adm_date_rec_num, disch_date_num,
adm_age_rec, nombre_centro_rec, tr_compliance, senda
) |>
# referrals produce contiguous episodes by design, so they are excluded
# from the overlap analysis
filter(tr_compliance != "referral") |>
data.table::as.data.table()
# Earlier, stricter variant kept for reference (required x's discharge to
# fall after y's discharge rather than after y's admission):
# overlap_dates_C1 <- janitor::clean_names(sqldf::sqldf("
# SELECT *
# FROM CONS_C1_df_dup_intervals x
# INNER JOIN CONS_C1_df_dup_intervals y
# ON x.hash_key_2 = y.hash_key_2
# AND x.row2 < y.row2 -- Avoids duplicates due to unique order
# AND x.adm_date_rec_num < y.disch_date_num
# AND x.disch_date_num > y.disch_date_num"))
# Find overlapping treatment intervals with a self-join in sqldf (SQLite):
# two episodes of the same HASH overlap when each one's admission date
# falls strictly before the other's discharge date.
overlap_dates_C1 <- janitor::clean_names(sqldf::sqldf("
SELECT * -- no me interesa sólo de x, sino de ambos
FROM CONS_C1_df_dup_intervals x
INNER JOIN CONS_C1_df_dup_intervals y
ON x.hash_key_2 = y.hash_key_2
AND x.row2 < y.row2 -- Prevents duplicate pairs
AND x.adm_date_rec_num < y.disch_date_num -- Admitted before being admitted into another treatment
AND x.disch_date_num > y.adm_date_rec_num -- Discharge after being admitted in other
"))
#arrange(overlap_dates_C1, hash_key_2, adm_date_rec_num, row2)
# Sanity check: warn when any HASH takes part in more than one overlapping
# pair (a repeated hash_key_2 is exactly a group of size > 1).
if (any(duplicated(overlap_dates_C1$hash_key_2))) {
  warning("There are HASHs with more than one overlapping")
} else {
  message("HASHs with overlapping dates do not repeat themselves")
}
#nrow(overlap_dates_C1) # pasamos de 1062 a
#394
# Plot
# overlap_dates_C1 adm_date_rec, disch_date
# Plot each overlapping pair as two horizontal segments per HASH:
# the x-episode in blue and the y-episode (suffix _2 after clean_names)
# in red, so overlap shows up as segments sharing the same row.
arrange(overlap_dates_C1, hash_key_2, adm_date_rec_num, row2) |>
ggplot() +
geom_segment(
aes(y = as.POSIXct(adm_date_rec), yend = as.POSIXct(disch_date),
x = hash_key_2, xend = hash_key_2), color="blue", alpha=.5, linewidth=1.3
) +
geom_segment(
aes(y = as.POSIXct(adm_date_rec_2), yend = as.POSIXct(disch_date_2),
x = hash_key_2, xend = hash_key_2), color="red", alpha=.5, linewidth=1.3
) +
# flip so HASHs run down the y-axis and dates along x
coord_flip()+
# scale_x_datetime(
# breaks = scales::date_breaks("1 year"),
# limits = as.POSIXct(c('2000-01-01 09:00:00', '2023-01-01 09:00:00')),
# labels = scales::date_format("%m/%y")
# ) +
theme_minimal(base_size = 12) +
theme(
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "none",
panel.grid = element_blank(),
plot.caption = element_text(hjust = 0, face = "italic")
) +
labs(
x = "Dates of admission and discharge",
y = "",
caption = "Note. Only users that share characteristics and overlap between them"
)Figure 6.1 shows 394 record pairs that share the same HASH Key, but where the date of admission is earlier than the date of discharge of another entry in the dataset, and the date of discharge is later than the date of admission of that other entry. It does not include referral as a cause of discharge. These conditions let us see how many cases overlap with another entry in the dataset. This graphic may seem a bit noisy because it covers all the overlapped cases, but we should look less at the densely colored regions and more at the white areas between the lines to get an idea of the years that accumulate more overlappings.
4.3 Missing Dates of Discharge
It is not possible to report Table 16 because there are no treatment dates in this database.
Code
# row2, hash_key_2, codigo_identificacion, TABLE_rec, adm_date_rec, disch_date, adm_date_rec_num, disch_date_num,
# adm_age_rec, nombre_centro_rec, tr_compliance, senda
# Count missing discharge dates per source-database year. For these rows a
# provisional duration (dit_trans) is computed up to the retrieval cutoff
# date, and its distance to the recorded days in treatment (diff_treat_days)
# is derived; the detailed summary statistics are kept commented out.
SISTRAT23_c1_2010_2022_df_prev1g|>
filter(is.na(disch_date))|>
mutate(dit_trans= as.numeric(as.Date("2023-04-28"))-adm_date_rec_num,
diff_treat_days= abs(dit_rec- dit_trans))|> # date of "today" (retrieval cutoff)
select(hash_key, codigo_identificacion, TABLE, sexo, adm_date_rec, disch_date, adm_date_rec_num, disch_date_num, dit_trans,dit_rec,diff_treat_days)|>
group_by(TABLE)|>
summarize(
n = n()
# mean_dit = mean(dit_trans, na.rm = TRUE),
# median_dit = median(dit_trans, na.rm = TRUE),
# sd_dit = sd(dit_trans, na.rm = TRUE),
# iqr_dit = IQR(dit_trans, na.rm = TRUE),
# min_dit = min(dit_trans, na.rm = TRUE),
# max_dit = max(dit_trans, na.rm = TRUE),
# p10_dit = quantile(dit_trans, 0.10, na.rm = TRUE),
# p25_dit = quantile(dit_trans, 0.25, na.rm = TRUE),
# p75_dit = quantile(dit_trans, 0.75, na.rm = TRUE),
# p90_dit = quantile(dit_trans, 0.90, na.rm = TRUE),
# mean_diff_treat = mean(diff_treat_days, na.rm = TRUE),
# median_diff_treat = median(diff_treat_days, na.rm = TRUE),
# sd_diff_treat = sd(diff_treat_days, na.rm = TRUE),
# iqr_diff_treat = IQR(diff_treat_days, na.rm = TRUE),
# min_diff_treat = min(diff_treat_days, na.rm = TRUE),
# max_diff_treat = max(diff_treat_days, na.rm = TRUE),
# p10_diff_treat = quantile(diff_treat_days, 0.10, na.rm = TRUE),
# p25_diff_treat = quantile(diff_treat_days, 0.25, na.rm = TRUE),
# p75_diff_treat = quantile(diff_treat_days, 0.75, na.rm = TRUE),
# p90_diff_treat = quantile(diff_treat_days, 0.90, na.rm = TRUE)
) |>
knitr::kable(format = "markdown", format.args = list(decimal.mark = ".", big.mark = ","),
caption="Missing Date of Discharge by Database year", align =rep('c', 2))| TABLE | n |
|---|---|
| 2010 | 16 |
| 2011 | 69 |
| 2012 | 11 |
| 2013 | 23 |
| 2014 | 16 |
| 2015 | 25 |
| 2016 | 55 |
| 2017 | 60 |
| 2018 | 118 |
| 2019 | 507 |
| 2020 | 41 |
| 2021 | 38 |
| 2022 | 3,916 |
Much of the 2022 database contains missing dates of discharge. This is a problem because the date of discharge is needed to calculate the days in treatment. As noticed, in 2022 there are many missing discharge dates; this may be because the database is right-truncated. What is more intriguing is the amount of missing discharge dates in 2018 and 2019, which may reflect right truncation as of the 2019 retrieval. We will need to review records with the same or similar admission dates and obtain the real missing discharge dates.
Code
# Subset of admissions with no discharge date; logs summary counts as a
# side effect while passing the data frame through unchanged.
# Fix: message() writes to stderr and returns NULL invisibly; the former
# print(message(...)) wrappers only echoed a spurious "NULL" after each
# line, so the print() calls were removed.
miss_disch_dates<-
tidytable::filter(SISTRAT23_c1_2010_2022_df_prev1g, is.na(disch_date)) |>
(\(df) {
message(paste0("Missing discharge dates, Total: ", nrow(df)))
message(paste0("Missing discharge dates, Number of HASHs: ", tidytable::distinct(df, hash_key) |> nrow()))
df
})()
NULL
As stated in the meeting of Jan. 13, 2020, an alternative would be to impute days of treatment and generate a new date of discharge by adding the days of treatment to the date of admission. We may look whether these cases to impute may have overlapping dates with other subsequent admissions.
Code
# Attach bilingual (Spanish/English) variable labels to the consolidated
# dataset so exported codebooks and summaries are self-describing.
codebook::var_label(SISTRAT23_c1_2010_2022_df_prev1g) <- list(
TABLE = 'Origen de los Datos (de los archivos por año)/Source of Data (of files per year)',
hash_key = 'Codificación del RUN/Masked Identifier (RUN)',
codigo_identificacion = 'Código Identificación de SENDA/SENDAs ID',
nombre_centro = 'Nombre del Centro de Tratamiento/Treatment Center',
tipo_centro = 'Tipo de Centro/Type of Center',
region_del_centro = 'Región del Centro/Chilean Region of the Center',
servicio_de_salud = 'Servicio de Salud/Health Service',
tipo_de_programa = 'Tipo de Programa de Tratamiento/Type of Treatment Program',
tipo_de_plan = 'Tipo de Plan/Type of Plan',
senda = 'SENDA/SENDA',
dias_en_tratamiento = 'Días de Tratamiento/Days of Treatment',
n_meses_en_tratamiento = 'Número de Meses en Tratamiento/Number of Months in Treatment',
dias_en_senda = 'Días en SENDA/Days in SENDA',
n_meses_en_senda = 'Número de Meses en SENDA/Number of Months in SENDA',
sexo = 'Sexo del Usuario/Sex',
edad = 'Edad (número entero)/Age (Discrete Number)',
nombre_usuario = 'Nombre del Usuario (OCULTO y no accesible)/Name of the User (Not Accessible)',
comuna_residencia = 'Comuna de Residencia/Municipality of Residence',
origen_de_ingreso = 'Origen de Ingreso/Motive of Admission to Treatment',
pais_nacimiento = 'País de Nacimiento/Country of Birth',
nacionalidad = 'Nacionalidad/Nationality',
etnia = 'Etnia/Ethnicity',
estado_conyugal = 'Estado Conyugal/Marital Status',
numero_de_hijos = 'Número de Hijos/Number of Children',
numero_de_hijos_ingreso_tratamiento_residencial = 'Número de Hijos para Ingreso a Tratamiento Residencial/Number of Children to Residential Treatment',
parentesco_con_el_jefe_de_hogar = '(Sólo presenta valores perdidos)/',
numero_de_tratamientos_anteriores = 'Número de Tratamientos Anteriores/Number of Previous Treatments',
fecha_ultimo_tratamiento = 'Fecha del Último Tratamiento (aún no formateada como fecha)/Date of the Last Treatment',
sustancia_de_inicio = 'Sustancia de Inicio/Starting Substance',
edad_inicio_consumo = 'Edad de Inicio de Consumo/Age of Onset of Drug Use',
se_trata_de_una_mujer_embarazada = 'Mujer Embarazada al Ingreso/Pregnant at Admission',
escolaridad_ultimo_ano_cursado = 'Escolaridad: Nivel Educacional/Educational Attainment',
condicion_ocupacional = 'Condición Ocupacional/Occupational Condition',
categoria_ocupacional = 'Categoría Ocupacional/Occupational Category',
rubro_trabaja = 'Rubro de Trabajo/Area of Work',
con_quien_vive = 'Persona con la que vive el Usuario/People that Share Household with the User',
tipo_de_vivienda = 'Tipo de Vivienda/Type of Housing',
tenencia_de_la_vivienda = 'Tenencia de la Vivienda/Tenure status of Households',
sustancia_principal = 'Sustancia Principal/Main Substance',
otras_sustancias_no1 = 'Otras Sustancias (1)/Other Substances (1)',
otras_sustancias_no2 = 'Otras Sustancias (2)/Other Substances (2)',
otras_sustancias_no3 = 'Otras Sustancias (3)/Other Substances (3)',
frecuencia_de_consumo_sustancia_principal = 'Frecuencia de Consumo de la Sustancia Principal/Frequency of Consumption of the Main Substance',
edad_inicio_sustancia_principal = 'Edad de Inicio de Consumo Sustancia Principal/Age of Onset of Drug Use Principal Substance',
via_administracion_sustancia_principal = 'Vía de Administración de la Sustancia Principal/Route of Administration of the Main Substance',
diagnostico_trs_consumo_sustancia = 'Diagnóstico de Trastorno por Consumo de Sustancias/Diagnosis of Substance Use Disorder',
diagnostico_trs_psiquiatrico_dsm_iv = 'Diagnóstico Psiquiátrico, DSM IV/Diagnosis of Psychiatric Disorders, DSM-IV',
diagnostico_trs_psiquiatrico_sub_dsm_iv = 'Diagnóstico Psiquiátrico, DSM IV (Subclasificación)/Diagnosis of Psychiatric Disorders, DSM-IV criteria (subclassification)',
diagnostico_trs_psiquiatrico_cie_10 = 'Diagnóstico Psiquiátrico, CIE-10/Diagnosis of Psychiatric Disorders, CIE-10',
diagnostico_trs_psiquiatrico_sub_cie_10 = 'Diagnóstico Psiquiátrico, CIE-10 (Subclasificación)/Diagnosis of Psychiatric Disorders, CIE-10 criteria (subclassification)',
compromiso_biopsicosocial = 'Compromiso Biopsicosocial/Biopsychosocial Involvement',
fecha_ingreso_a_tratamiento = 'Fecha de Ingreso a Tratamiento/Date of Admission to Treatment',
fecha_egreso_de_tratamiento = 'Fecha de Egreso de Tratamiento/Date of Discharge from Treatment',
motivo_de_egreso = 'Motivo de Egreso/Cause of Discharge'
)5. Preliminary Summary in January 2020
Many selections for the purposes of the study are still necessary today, in order to keep the greatest possible amount of information about each event.
Code
#knitr::include_graphics("G:/Mi unidad/Alvacast/SISTRAT 2019 (github)/SUD_CL/Figures/Figure_Duplicates.svg")
# Flowchart of the deduplication process. Nodes a -> b -> c are successive
# dataset reductions; node c fans out into the four remaining sources of
# potential duplicates (d-g). The '@@1'..'@@7' placeholders are substituted
# at render time by the footnote expressions ([1]: .. [7]:) that DiagrammeR
# evaluates after the closing brace of the digraph; each footnote computes
# the row count of the corresponding dataset, so the counts shown always
# match the objects in the current workspace.
DiagrammeR::grViz("
digraph graph2 {
graph [layout = dot]
# node definitions with substituted label text
node [shape = rectangle, width = 4, color = 'steelblue',fillcolor = lightblue]
a [label = '@@1']
b [label = '@@2']
c [label = '@@3']
d [label = '@@4']
e [label = '@@5']
f [label = '@@6']
g [label = '@@7']
a -> b -> c -> {d e f g}
}
[1]: paste0('Once removed same values in >100 variables (n = ', formatC(nrow(SISTRAT23_c1_2010_2022_df2), format='f', big.mark=',', digits=0), ')')
[2]: paste0('Once removed same values in variables related to treatments and substance use (n = ', formatC(nrow(SISTRAT23_c1_2010_2022_df_prev1b), format='f', big.mark=',', digits=0), ')')
[3]: paste0('Preliminary Dataset (n = ', formatC(nrow(SISTRAT23_c1_2010_2022_df_prev1g), format='f', big.mark=',', digits=0), ')')
[4]: paste0('Same HASH &\\nDate of Admission (n = ', formatC(0, format='f', big.mark=',', digits=0), ')')
[5]: paste0('Overlapped Ranges\\nof Treatments (n = ', formatC(nrow(overlap_dates_C1), format='f', big.mark=',', digits=0), ')')
[6]: paste0('Pair of probabilistic scored matches (>10 points) &\\nnon-scored matches of RUN &\\n|10-day| difference in admission dates (n = ', formatC(nrow(matches_noscore)+nrow(matches_scored_filtered), format='f', big.mark=',', digits=0), ')')
[7]: paste0('Missing discharge dates \\n w/possible overlapping in subsequent\\ntreatments (n = ', formatC(nrow(tidytable::filter(SISTRAT23_c1_2010_2022_df_prev1g, is.na(disch_date))), format='f', big.mark=',', digits=0), ')')
")
To close the project, we erase polars objects.
Code
rm(list = ls()[grepl("_pl$", ls())])
Session info
Code
#|echo: true
#|error: true
#|message: true
#|paged.print: true
message(paste0("R library: ", Sys.getenv("R_LIBS_USER")))
Code
message(paste0("Date: ",withr::with_locale(new = c('LC_TIME' = 'C'), code =Sys.time())))
Code
message(paste0("Editor context: ", path))
Code
cat("quarto version: "); quarto::quarto_version()
quarto version:
[1] '1.7.29'
Code
sesion_info <- devtools::session_info()
Warning in system2(“quarto”, “-V”, stdout = TRUE, env = paste0(“TMPDIR=”, : el comando ejecutado ‘“quarto” TMPDIR=C:/Users/andre/AppData/Local/Temp/Rtmp8qsQFn/filee92435a14788 -V’ tiene el estatus 1
Code
# Interactive table of the attached R packages (name, loaded version, source),
# rendered with DT and styled via the compact-CSS initComplete callback.
pkg_info <- tibble::as_tibble(sesion_info$packages)
pkg_info <- dplyr::select(pkg_info, c(package, loadedversion, source))
DT::datatable(
  pkg_info,
  filter = 'top',
  colnames = c('Row number' = 1, 'Package' = 2, 'Version' = 3),
  caption = htmltools::tags$caption(
    style = 'caption-side: top; text-align: left;',
    '', htmltools::em('R packages')),
  options = list(
    initComplete = htmlwidgets::JS(
      "function(settings, json) {",
      "$(this.api().tables().body()).css({
'font-family': 'Helvetica Neue',
'font-size': '70%',
'code-inline-font-size': '15%',
'white-space': 'nowrap',
'line-height': '0.75em',
'min-height': '0.5em'
});",
      "}")))
Code
#|echo: true
#|error: true
#|message: true
#|paged.print: true
#| class-output: center-table
# Interactive table of the Python packages visible to reticulate, rendered
# with the same compact-CSS styling as the R package table.
# NOTE(review): the rendered output fused below records an
# 'Error in path.expand(path)' — reticulate's Python discovery appears to
# have failed in this render; verify the RETICULATE_PYTHON setting.
reticulate::py_list_packages() %>%
DT::datatable(filter = 'top', colnames = c('Row number' =1,'Package' = 2, 'Version'= 3),
caption = htmltools::tags$caption(
style = 'caption-side: top; text-align: left;',
'', htmltools::em('Python packages')),
options=list(
initComplete = htmlwidgets::JS(
"function(settings, json) {",
"$(this.api().tables().body()).css({
'font-family': 'Helvetica Neue',
'font-size': '70%',
'code-inline-font-size': '15%',
'white-space': 'nowrap',
'line-height': '0.75em',
'min-height': '0.5em'
});",
"}"))) Error in path.expand(path): argumento ‘path’ inválido
Save
Code
# Rebuild the project paths (duplicated from the loading chunk so this save
# chunk can also run standalone): wdpath = project root without the trailing
# "cons" segment.
wdpath <-
  paste0(gsub("/cons", "", gsub("cons", "", paste0(getwd(), "/cons"))))
# The drive letter decides which copy of the project folder is in use.
envpath <- if (regmatches(wdpath, regexpr("[A-Za-z]+", wdpath)) == "G") {
  "G:/Mi unidad/Alvacast/SISTRAT 2023/"
} else {
  "E:/Mi unidad/Alvacast/SISTRAT 2023/"
}
# Bare expressions: echo the candidate output folders into the rendered doc.
paste0(getwd(), "/cons")
file.path(paste0(wdpath, "data/20241015_out"))
file.path(paste0(envpath, "data/20241015_out"))
# Save the full workspace image, stamped with today's date.
rdata_path <- file.path(wdpath, "data/20241015_out",
                        paste0("3_ndp_", format(Sys.time(), "%Y_%m_%d"), ".Rdata"))
save.image(rdata_path)
cat("Saved in:",
    rdata_path)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
if (Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv")) {
  # On the server / inside Docker the secret is already injected as an env
  # var; `password` is captured here for parity with the loading chunk.
  password <- Sys.getenv("PASSWORD_SECRET")
} else {
  if (interactive()) {
    # Flush the current history to a throwaway temp file so the secret never
    # lands in .Rhistory, then restore the default history file.
    utils::savehistory(tempfile())
    utils::loadhistory()
  }
  # FIX: the secret file was previously read and Sys.setenv() called twice
  # (once inside the interactive() branch and once unconditionally after it);
  # read it a single time here — same final state, one read.
  Sys.setenv(PASSWORD_SECRET = readLines(paste0(wdpath, "secret.txt"), warn = FALSE))
}
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Write a second (plaintext) image under the .enc name, then encrypt it in
# place. "PASSWORD_SECRET" names the env var httr2 resolves to obtain the
# key — it is not the key itself.
save.image(paste0(rdata_path, ".enc"))
# Encrypt the file in the same location
httr2::secret_encrypt_file(path = paste0(rdata_path, ".enc"), key = "PASSWORD_SECRET")
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("Copy renv lock into cons folder\n")
if (Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv")) {
  message("Running on RStudio Server or inside Docker. Folder copy skipped.")
} else {
  # FIX: dropped the dead `source_folder <- destination_folder <- ...`
  # chained assignment (neither variable was used) and the misleading
  # "copy recursively" comment — only renv.lock is copied, not the renv/
  # folder.
  file.copy(paste0(wdpath, "renv.lock"), paste0(wdpath, "cons/renv.lock"),
            overwrite = TRUE)
  message("Renv lock copy performed.")
}
Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
time_after_dedup1<-Sys.time()
paste0("Time in markdown: ");time_after_dedup1-time_before_dedup1
[1] "G:/My Drive/Alvacast/SISTRAT 2023/cons/cons"
[1] "G:/My Drive/Alvacast/SISTRAT 2023//data/20241015_out"
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/data/20241015_out"
Saved in: G:/My Drive/Alvacast/SISTRAT 2023///data/20241015_out/3_ndp_2025_06_02.Rdata
Copy renv lock into cons folder
[1] "Time in markdown: "
Time difference of 40.12067 mins